using System;
using System.Collections.Generic;
using System.Linq;
using System.Text.RegularExpressions;
using System.Threading;
using LuhnNet;
using Newtonsoft.Json; //Do not remove - Required for JObject
using Newtonsoft.Json.Linq;
using Shinydocs.CognitiveToolkit.Scripting;
using Shinydocs.CognitiveToolkit.Core.Tools.ScrollTools.RunScript;
using Shinydocs.CognitiveToolkit.UI.CommandLine;
using Shinydocs.CognitiveToolkit.UI.CommandLine.Progress;
using Shinydocs.CognitiveToolkit.Utilities; //Do not remove - Required for JObject

#pragma warning disable CS8603
#pragma warning disable CS8619
#pragma warning disable CS8604
#pragma warning disable CS8600
#pragma warning disable CS8601
#pragma warning disable CS8618

/// <summary>
/// RunScript script that appends a tag value to a field of every queried document
/// whose search field matches a regular expression, optionally requiring that at
/// least one regex match also passes the Luhn checksum.
/// </summary>
public class FlagFieldBasedOnRegexScript : IScript
{
    // ScriptLogger allows you to log to the same index as the rest of the cognitive toolkit.
    // See https://github.com/serilog/serilog/wiki/Writing-Log-Events for message formatting information
    private readonly ScriptLogger _log = new ScriptLogger();

    private const string ScriptName = "FlagFieldBasedOnRegexScript";
    private const string Version = "2.6.0.1";

    // Fields requested for each document; SetUp appends the search field and the target field.
    private string[] _fieldsToReturn = { "id", "path", "length" };
    private string _indexName;
    private string _query;
    private string _serverUrl;

    // Value of --regex-pattern.
    public string _regexPattern;
    // Value of --value: the tag appended to the target field.
    public string _tagValue;
    // Regex built from _regexPattern in SetUp.
    public Regex _regex;
    // Value of --search-field (defaults to "fullText"): the field the regex is run against.
    public string _searchField;
    // Value of --field-name: the field that receives the tag.
    public string _fieldName;
    // Value of --valid-luhn: when true, a regex hit only counts if a match passes the Luhn check.
    public bool _validateWithLuhn;

    private RunScriptDocumentUpdater _documentUpdater;
    private int _threads;
    private int _nodesPerRequest;

    /// <summary>
    /// SetUp Called before executing run
    /// </summary>
    /// <param name="arguments">Array of command line arguments passed in, excluding the Script File Path ("-p") and Script Class Name ("-c") parameters used by the RunScript tool</param>
    /// <exception cref="ArgumentException">Thrown when the arguments cannot be parsed into options.</exception>
    public void SetUp(string[] arguments)
    {
        _log.Information(string.Format("{0} version {1}", ScriptName, Version));
        Console.WriteLine(string.Format("{0} version {1}", ScriptName, Version));

        try
        {
            Dictionary<string, string> options;

            // OptionsParser will parse an array of arguments into a dictionary of flags and values
            if (!OptionsParser.TryParse(arguments, out options))
            {
                throw new ArgumentException("Failed to parse arguments. Tool requires exactly the following parameters -q (Query or Path to QueryFile) -u (Index Server Url) -i (IndexName).");
            }

            OptionsParser.ParseStandardOptions(options, out _serverUrl, out _indexName, out _threads, out _nodesPerRequest);
            OptionsParser.ParseQueryOption(options, out _query);

            if (!options.TryGetValue("--regex-pattern", out _regexPattern))
            {
                OptionsParser.InputError("The regex (--regex-pattern) is a required parameter.");
            }

            if (!options.TryGetValue("--value", out _tagValue))
            {
                OptionsParser.InputError("The value (--value) is a required parameter.");
            }

            if (!options.TryGetValue("--field-name", out _fieldName))
            {
                OptionsParser.InputError("The field name (--field-name) is a required parameter.");
            }

            if (!options.TryGetValue("--search-field", out _searchField))
            {
                _searchField = "fullText";
            }

            string validateLuhnString;
            if (options.TryGetValue("--valid-luhn", out validateLuhnString))
            {
                // An unparseable value leaves _validateWithLuhn at false (TryParse's out default).
                bool.TryParse(validateLuhnString, out _validateWithLuhn);
            }

            // The updater must fetch the field being searched and the field being written.
            _fieldsToReturn = _fieldsToReturn.Append(_searchField).Append(_fieldName).ToArray();
            _regex = new Regex(_regexPattern);
            _documentUpdater = new RunScriptDocumentUpdater(_serverUrl, _indexName, _fieldsToReturn, _nodesPerRequest, _threads);
        }
        catch (Exception ex)
        {
            var message = ex.Message + "\n\nPlease review the following parameters:" + "\n" +
                          "-q \tquery or path to queryfile" + "\n" +
                          "-u \tindex server URL" + "\n" +
                          "-i \tindex name" + "\n" +
                          "--regex-pattern \tRegex pattern" + "\n" +
                          "--value \tfield value" + "\n" +
                          "--field-name \tfield name" + "\n" +
                          "--search-field \tField to search (Default: fullText)" + "\n" +
                          "--valid-luhn \tValidate using Luhn algorithm (Optional)" + "\n" +
                          "--threads (Default: 1)" + "\n" +
                          "--nodes-per-request (Default: 100)";
            _log.Error(ex, message);
            Console.WriteLine(message);
            throw;
        }
    }

    /// <summary>
    /// Runs the document updater over the configured query. Each document that passes
    /// <see cref="PassesChecks"/> is returned to the updater as a dictionary; every other
    /// document (including one whose processing threw) yields an empty dictionary.
    /// </summary>
    public void Run()
    {
        var processedTotal = 0;
        var updatedTotal = 0;

        using (var progress = new CommandLineProgress())
        {
            _documentUpdater.Progress = progress;
            _documentUpdater.Update(_query, document =>
            {
                var id = document["id"];
                try
                {
                    // Interlocked because the updater is configured with a thread count;
                    // NOTE(review): presumably the callback can run concurrently - confirm.
                    Interlocked.Increment(ref processedTotal);

                    if (PassesChecks(document))
                    {
                        Interlocked.Increment(ref updatedTotal);
                        return document.ToObject<Dictionary<string, object>>();
                    }
                }
                catch (Exception ex)
                {
                    // Pass the exception as the first argument (same overload SetUp uses) so it
                    // is attached to the log event instead of being treated as a format argument.
                    _log.Error(ex, string.Format("Error processing {0}.{1}", id, ex.Message));
                }

                // NOTE(review): an empty dictionary is assumed to mean "no changes" to the updater - confirm.
                return new Dictionary<string, object>();
            });
        }

        _log.Information(string.Format("Processed {0} item(s); {1} flagged for update.", processedTotal, updatedTotal));
    }

    /// <summary>
    /// Decides whether a document should be tagged and, if so, tags it in place.
    /// </summary>
    /// <param name="document">Document to inspect; modified via <see cref="UpdateDocument"/> when it passes.</param>
    /// <returns>
    /// <c>true</c> when the search field exists, matches the regex and (if Luhn validation
    /// is enabled) contains a Luhn-valid match; otherwise <c>false</c>.
    /// </returns>
    public bool PassesChecks(JObject document)
    {
        var searchField = document[_searchField];
        if (searchField == null)
        {
            return false;
        }

        var text = searchField.ToString();
        if (!MatchRegex(text))
        {
            return false;
        }

        if (_validateWithLuhn && !AllMatchesValid(text))
        {
            return false;
        }

        UpdateDocument(document);
        return true;
    }

    /// <summary>
    /// Called after the run; logs the updater's total processed count.
    /// </summary>
    public void TearDown()
    {
        _log.Information(string.Format("Complete. Total items processed : {0}", RunScriptDocumentUpdater.TotalProcessed));
    }

    /// <summary>
    /// Removes the search field from the document and ensures the target field is an
    /// array containing the tag value (compared case-insensitively).
    /// </summary>
    /// <param name="document">Document to modify in place.</param>
    /// <returns>The same <paramref name="document"/> instance.</returns>
    public JObject UpdateDocument(JObject document)
    {
        var existing = document[_fieldName];
        var tags = existing as JArray;
        if (tags == null)
        {
            tags = new JArray();

            // Keep a pre-existing scalar value instead of silently discarding it.
            if (existing != null && existing.Type != JTokenType.Null)
            {
                tags.Add(existing.DeepClone());
            }
        }

        // The searched text is dropped so it is not part of the dictionary handed back in Run.
        document.Remove(_searchField);

        // Ordinal comparison: tag values are identifiers, so matching must not vary by culture.
        var alreadyTagged = tags.Any(t => string.Equals(t.ToString(), _tagValue, StringComparison.OrdinalIgnoreCase));
        if (!alreadyTagged)
        {
            tags.Add(new JValue(_tagValue));
        }

        document[_fieldName] = tags;
        return document;
    }

    /// <summary>
    /// Tests whether the text is non-blank and matches the configured regex.
    /// </summary>
    /// <param name="text">Text to test; may be null.</param>
    /// <returns><c>true</c> on a match; <c>false</c> for null/blank text or no match.</returns>
    public bool MatchRegex(string text)
    {
        return !string.IsNullOrWhiteSpace(text) && _regex.IsMatch(text);
    }

    /// <summary>
    /// Checks the regex matches in the text against the Luhn checksum.
    /// </summary>
    /// <param name="text">Text to scan with the configured regex.</param>
    /// <returns>
    /// <c>true</c> when at least one match passes <c>Luhn.IsValid</c>; <c>false</c> when there
    /// are no matches or none pass. Despite the method name, not every match has to be valid.
    /// </returns>
    public bool AllMatchesValid(string text)
    {
        // Any(predicate) is already false for an empty sequence and stops at the first valid match.
        return _regex.Matches(text).Cast<Match>().Any(m => Luhn.IsValid(m.Value));
    }
}