using System;
using System.Collections.Generic;
using System.Globalization;
using System.IO;
using System.Linq;
using System.Security.Permissions;
using System.Text.RegularExpressions;
using System.Threading;
using CsvHelper;
using Newtonsoft.Json;
//Do not remove - Required for JObject
using Newtonsoft.Json.Linq;
using Shinydocs.CognitiveToolkit.Core.Exceptions;
//Do not remove - Required for JObject
using Shinydocs.CognitiveToolkit.Scripting;
using Shinydocs.CognitiveToolkit.Core.Tools.ScrollTools.RunScript;
using Shinydocs.CognitiveToolkit.UI.CommandLine;
using Shinydocs.CognitiveToolkit.UI.CommandLine.Progress;
using Shinydocs.CognitiveToolkit.Utilities;

#pragma warning disable CS8602
#pragma warning disable CS8600
#pragma warning disable CS8603
#pragma warning disable CS8601
#pragma warning disable CS8618

/// <summary>
/// Script that reads (regex, tag) pairs from a CSV file, runs every regex against the
/// configured search fields of each document returned by a query, and stores the distinct
/// matches on the document in an array field named after the sanitized tag value.
/// </summary>
public class RegexEntityExtractor : IScript
{
    private static readonly string ScriptName = "RegexEntityExtractor";
    private static readonly string Version = "2.6.0.1";

    private readonly ScriptLogger _log = new ScriptLogger();
    private RunScriptDocumentUpdater _documentUpdater;

    public string _serverUrl;
    public string _indexName;
    public string _csvFile;
    public string _query;
    public string _regexColumn;
    public string _tagColumnName;
    public string _searchFields;
    public List<string> _searchFieldsList;
    public List<dynamic> _records;

    private int _threads;
    private int _nodesPerRequest;

    /// <summary>
    /// Parses the command-line arguments into the script's settings and creates the
    /// document updater.
    /// </summary>
    /// <param name="arguments">Raw command-line arguments passed to the script.</param>
    /// <exception cref="ArgumentException">
    /// Thrown when the arguments cannot be parsed or a required option
    /// (--csv, --regex-column-name, --tag-column-name) is missing.
    /// </exception>
    public void SetUp(string[] arguments)
    {
        _log.Information(string.Format("{0} version {1}", ScriptName, Version));
        Console.WriteLine(string.Format("{0} version {1}", ScriptName, Version));

        Dictionary<string, string> options;
        if (OptionsParser.TryParse(arguments, out options))
        {
            // Required
            OptionsParser.ParseStandardOptions(options, out _serverUrl, out _indexName, out _threads, out _nodesPerRequest);
            OptionsParser.ParseQueryOption(options, out _query);

            if (!options.TryGetValue("--csv", out _csvFile))
                throw new ArgumentException("The path to csv (--csv) is a required parameter");

            if (!options.TryGetValue("--regex-column-name", out _regexColumn))
                throw new ArgumentException("The column name for the regex is a required parameter");

            if (!options.TryGetValue("--tag-column-name", out _tagColumnName))
                throw new ArgumentException("The column name for the name of the match is a required parameter");

            // Optional
            if (!options.TryGetValue("--search-columns", out _searchFields))
                _searchFields = "fullText";

            // Trim each entry so "a, b" yields "b" rather than " b" (which would never
            // match a document field), and drop empty entries left by stray commas.
            _searchFieldsList = _searchFields
                .Split(',')
                .Select(f => f.Trim())
                .Where(f => f.Length > 0)
                .ToList();

            _documentUpdater = new RunScriptDocumentUpdater(
                _serverUrl,
                _indexName,
                _searchFieldsList,
                _nodesPerRequest,
                _threads);
        }
        else
        {
            var message = "\n\nPlease review the following parameters:" + "\n" +
                          "-q \tquery or path to queryfile" + "\n" +
                          "-u \tindex server URL" + "\n" +
                          "-i \tindex name" + "\n" +
                          "--csv \tPath to the comma separated value file" + "\n" +
                          "--regex-column-name \tcolumn that specifies the regular expression" + "\n" +
                          "--tag-column-name \tcolumn that specifies the name to tag a matching record with" + "\n" +
                          "--search-columns \tComma separated list of columns to search (Default: fullText)" + "\n" +
                          "--threads (Default: 1)" + "\n" +
                          "--nodes-per-request (Default: 100)";

            _log.Error(message);
            Console.WriteLine(message);
            throw new ArgumentException("Failed to parse arguments");
        }
    }

    /// <summary>
    /// Loads the CSV rules and then processes every document returned by the query,
    /// reporting progress on the command line.
    /// </summary>
    public void Run()
    {
        Console.WriteLine("\n");
        LoadCsv();

        using (var progress = new CommandLineProgress())
        {
            _documentUpdater.Progress = progress;
            IterateDocuments(progress);
        }
    }

    /// <summary>
    /// Reads all rows of the CSV file into <c>_records</c> and verifies that the
    /// configured regex and tag columns exist.
    /// </summary>
    /// <exception cref="Exception">
    /// Thrown when the CSV has no rows, the first row cannot be read as a dictionary,
    /// or one of the two required columns is missing.
    /// </exception>
    private void LoadCsv()
    {
        using (var streamReader = new StreamReader(_csvFile))
        using (var reader = new CsvReader(streamReader, false))
        {
            reader.Configuration.HasHeaderRecord = true;
            _records = reader.GetRecords<dynamic>().ToList();

            if (_records.Count == 0)
            {
                throw new Exception("There are no records in the csv");
            }

            // The first row is inspected only for its keys (the column names).
            var header = _records[0] as IDictionary<string, object>;
            if (header == null)
            {
                throw new Exception("There was an error reading the csv");
            }

            if (!header.ContainsKey(_regexColumn))
            {
                throw new Exception("The regex column could not be found in the csv");
            }

            if (!header.ContainsKey(_tagColumnName))
            {
                throw new Exception("The tag column could not be found in the csv");
            }
        }
    }

    /// <summary>
    /// Runs <see cref="TagDocument"/> over every document the updater yields for the query.
    /// </summary>
    /// <param name="commandLineProgress">Progress reporter; not used directly by this method.</param>
    /// <exception cref="Exception">
    /// Thrown when tagging a document raises a <c>FatalException</c>; the original
    /// exception is attached as the inner exception.
    /// </exception>
    private void IterateDocuments(CommandLineProgress commandLineProgress)
    {
        _documentUpdater.Update(_query, document =>
        {
            var id = document["id"];

            try
            {
                if (TagDocument(ref document))
                {
                    return document.ToObject<Dictionary<string, object>>();
                }
            }
            catch (Exception ex)
            {
                _log.Error(string.Format("Error processing {0}.{1}", id, ex.Message), ex);

                if (ex is FatalException)
                {
                    // Keep the original exception so the root cause is not lost.
                    throw new Exception("Execution stopped due to fatal exception.", ex);
                }
            }

            // NOTE(review): an empty dictionary presumably tells the updater there is
            // nothing to write for this document - confirm against RunScriptDocumentUpdater.
            return new Dictionary<string, object>();
        });
    }

    /// <summary>
    /// Applies every CSV rule to the document's search fields and appends new, distinct
    /// matches to the array field named after the rule's sanitized tag value.
    /// </summary>
    /// <param name="document">Document to inspect; modified in place when matches are added.</param>
    /// <returns><c>true</c> if at least one new match was added to the document.</returns>
    /// <exception cref="Exception">
    /// Propagated from <see cref="GetDictionaryValues"/> when a row lacks a usable regex or tag.
    /// </exception>
    public bool TagDocument(ref JObject document)
    {
        var update = false;

        for (var i = 0; i < _records.Count; i++)
        {
            var dict = _records[i] as IDictionary<string, object>;
            if (dict == null)
            {
                continue;
            }

            var rowDetails = GetDictionaryValues(dict);
            var regex = rowDetails.Item1;
            var tagValue = rowDetails.Item2;

            // An empty pattern matches at every position and would only add "" entries;
            // a blank tag would create a field with an empty name. Skip such rows.
            if (string.IsNullOrEmpty(regex) || string.IsNullOrWhiteSpace(tagValue))
            {
                continue;
            }

            // Invariant lower-casing keeps the field name independent of the machine's
            // culture (e.g. Turkish 'I' would otherwise become a non-ASCII character).
            tagValue = RemoveSpecialCharacters(tagValue.Trim().ToLowerInvariant());

            foreach (var name in _searchFieldsList)
            {
                JToken field;
                if (!document.TryGetValue(name, out field))
                {
                    continue;
                }

                // Reuse the existing tag array when there is one with values; otherwise start fresh.
                JToken tagField;
                if (!document.TryGetValue(tagValue, out tagField) || !tagField.HasValues)
                {
                    tagField = new JArray();
                }

                var array = tagField as JArray;
                if (array == null)
                {
                    continue;
                }

                var uniqueMatches = Regex.Matches(field.ToString(), regex)
                    .OfType<Match>()
                    .Select(m => m.Value.Trim())
                    .Distinct();

                // Track change per tag/field so an earlier hit does not cause empty
                // arrays to be written for later rules that matched nothing.
                var added = false;
                foreach (var match in uniqueMatches)
                {
                    if (array.All(m => m.ToString() != match))
                    {
                        array.Add(match);
                        added = true;
                    }
                }

                if (added)
                {
                    document[tagValue] = array;
                    update = true;
                }
            }
        }

        return update;
    }

    /// <summary>
    /// Extracts the regex pattern and tag value from one CSV row.
    /// </summary>
    /// <param name="dictionary">CSV row keyed by column name.</param>
    /// <returns>A tuple of (regex pattern, tag value).</returns>
    /// <exception cref="FatalException">Thrown when the regex or tag column is absent from the row.</exception>
    /// <exception cref="Exception">Thrown when the regex or tag value is not a string.</exception>
    public Tuple<string, string> GetDictionaryValues(IDictionary<string, object> dictionary)
    {
        object regex;
        object tagValue;

        if (!dictionary.TryGetValue(_regexColumn, out regex))
        {
            throw new FatalException(string.Format("The {0} column could not be found in {1}", _regexColumn, _csvFile));
        }

        if (!dictionary.TryGetValue(_tagColumnName, out tagValue))
        {
            throw new FatalException(string.Format("The {0} column could not be found in {1}", _tagColumnName, _csvFile));
        }

        var regexString = regex as string;
        if (regexString == null)
        {
            throw new Exception("Skipping row because the regex pattern was empty.");
        }

        var tagStringValue = tagValue as string;
        if (tagStringValue == null)
        {
            throw new Exception("Skipping row because the tag value was empty.");
        }

        return new Tuple<string, string>(regexString, tagStringValue);
    }

    /// <summary>
    /// No clean-up is performed.
    /// </summary>
    public void TearDown()
    {
    }

    // Table of characters allowed in a tag field name, indexed by char code.
    private static bool[] _lookup;

    /// <summary>
    /// Builds the table of allowed characters (0-9, A-Z, a-z and '_').
    /// </summary>
    public static void LoadLookup()
    {
        // Fill a local table first and publish it only once complete, so a concurrent
        // reader can never observe a partially populated table.
        var table = new bool[1024];

        for (var c = '0'; c <= '9'; c++) table[c] = true;
        for (var c = 'A'; c <= 'Z'; c++) table[c] = true;
        for (var c = 'a'; c <= 'z'; c++) table[c] = true;
        table['_'] = true;

        Volatile.Write(ref _lookup, table);
    }

    /// <summary>
    /// Returns a copy of <paramref name="str"/> in which every character other than
    /// 0-9, A-Z, a-z and '_' is replaced with '-'.
    /// </summary>
    /// <param name="str">Text to sanitize.</param>
    /// <returns>A string of the same length containing only allowed characters and '-'.</returns>
    /// <exception cref="ArgumentNullException">Thrown when <paramref name="str"/> is null.</exception>
    public static string RemoveSpecialCharacters(string str)
    {
        if (str == null)
        {
            throw new ArgumentNullException("str");
        }

        if (Volatile.Read(ref _lookup) == null)
        {
            LoadLookup();
        }

        var lookup = Volatile.Read(ref _lookup);
        var buffer = new char[str.Length];

        for (var i = 0; i < str.Length; i++)
        {
            var c = str[i];

            // Characters beyond the table (code >= 1024) are never allowed; checking the
            // bound avoids an IndexOutOfRangeException on such input.
            buffer[i] = (c < lookup.Length && lookup[c]) ? c : '-';
        }

        return new string(buffer);
    }
}