WordDetector.cs
1/*
2 UdgerParser - Local parser lib
3
4 UdgerParser class parses useragent strings based on a database downloaded from udger.com
5
6
7 author The Udger.com Team (info@udger.com)
8 copyright Copyright (c) Udger s.r.o.
9 license GNU Lesser General Public License
10 link https://udger.com/products/local_parser
11 */
12
13using System;
14using System.Collections.Generic;
15
16namespace Udger.Parser
17{
19 {
20
/// <summary>
/// Immutable pairing of a word identifier with the word text it belongs to.
/// </summary>
struct WordInfo
{
    private readonly int _id;
    private readonly String _word;

    /// <summary>Creates an entry from an identifier and its word text.</summary>
    /// <param name="id">Identifier associated with the word.</param>
    /// <param name="word">Word text, stored exactly as given.</param>
    public WordInfo(int id, String word)
    {
        _id = id;
        _word = word;
    }

    /// <summary>Identifier supplied at construction.</summary>
    public int id
    {
        get { return _id; }
    }

    /// <summary>Word text supplied at construction.</summary>
    public String word
    {
        get { return _word; }
    }
}
32
// Span of the lowercase ASCII alphabet ('z' - 'a' = 25); used as the multiplier
// when the first two letters of a word are folded into a bucket index.
private const int ARRAY_DIMENSION = 'z' - 'a';

// Number of buckets: (25 + 1) * (25 + 1) = 676, which is above the largest index
// two letters can produce (25 * 25 + 25 = 650).
private const int ARRAY_SIZE = (ARRAY_DIMENSION + 1) * (ARRAY_DIMENSION + 1);

// Buckets of registered words; a slot stays null until a word is filed under it.
private readonly List<WordInfo>[] wordArray;

// Shortest word length recorded by addWord; int.MaxValue until a word is recorded.
private int minWordSize = int.MaxValue;

/// <summary>Creates a detector in which every bucket is still empty.</summary>
public WordDetector()
{
    this.wordArray = new List<WordInfo>[ARRAY_SIZE];
}
43
/// <summary>
/// Registers a word in the bucket selected by its first two letters.
/// </summary>
/// <remarks>
/// The word is lower-cased with the invariant culture before it is stored. A word that
/// could never be looked up again - fewer than two characters, or a first or second
/// character outside 'a'..'z' after lower-casing - is ignored and leaves the detector
/// unchanged.
/// </remarks>
/// <param name="id">Identifier to associate with the word.</param>
/// <param name="word">Word to register; must not be null.</param>
/// <exception cref="ArgumentNullException"><paramref name="word"/> is null.</exception>
public void addWord(int id, String word)
{
    if (word == null)
    {
        throw new ArgumentNullException(nameof(word));
    }

    // Two characters are needed to compute a bucket. Reject short words before any
    // state is touched so that minWordSize can never drop below 2.
    if (word.Length < 2)
    {
        return;
    }

    // Invariant lower-casing keeps 'I' -> 'i' regardless of the current culture
    // (a Turkish culture would otherwise produce a dotless 'ı' and lose the word).
    String s = word.ToLowerInvariant();
    char first = s[0];
    char second = s[1];

    // Only pairs of letters 'a'..'z' are ever used for lookups, so a word starting
    // with anything else could never be matched and is not stored.
    if (first < 'a' || first > 'z' || second < 'a' || second > 'z')
    {
        return;
    }

    if (s.Length < minWordSize)
    {
        minWordSize = s.Length;
    }

    // Both characters are letters, so the index is within [0, 650] and always in range.
    int index = (first - 'a') * ARRAY_DIMENSION + (second - 'a');
    List<WordInfo> bucket = wordArray[index];
    if (bucket == null)
    {
        bucket = new List<WordInfo>();
        wordArray[index] = bucket;
    }
    bucket.Add(new WordInfo(id, s));
}
65
/// <summary>
/// Finds every registered word that occurs anywhere in <paramref name="text"/>.
/// </summary>
/// <remarks>
/// Matching is case-insensitive (the text is lower-cased with the invariant culture)
/// and ordinal. Each position whose two leading characters are both in 'a'..'z'
/// selects one bucket, and only the words in that bucket are compared.
/// </remarks>
/// <param name="text">Text to scan; must not be null.</param>
/// <returns>
/// The identifiers of all words found; empty when nothing matches. Each identifier
/// appears at most once.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="text"/> is null.</exception>
public HashSet<int> findWords(String text)
{
    if (text == null)
    {
        throw new ArgumentNullException(nameof(text));
    }

    HashSet<int> ret = new HashSet<int>();
    String s = text.ToLowerInvariant();

    // A match needs at least minWordSize characters, and the bucket lookup reads two
    // characters, so never start closer than two characters from the end. While no
    // word is registered minWordSize is int.MaxValue and the loop does not run.
    int lastStart = s.Length - Math.Max(minWordSize, 2);
    for (int i = 0; i <= lastStart; i++)
    {
        char c1 = s[i];
        char c2 = s[i + 1];
        if (c1 < 'a' || c1 > 'z' || c2 < 'a' || c2 > 'z')
        {
            continue;
        }

        List<WordInfo> bucket = wordArray[(c1 - 'a') * ARRAY_DIMENSION + (c2 - 'a')];
        if (bucket == null)
        {
            continue;
        }

        foreach (WordInfo wi in bucket)
        {
            // Compare in place instead of copying the rest of the text for every
            // candidate. CompareOrdinal yields non-zero when fewer than
            // wi.word.Length characters remain, so a truncated tail never matches.
            if (String.CompareOrdinal(s, i, wi.word, 0, wi.word.Length) == 0)
            {
                ret.Add(wi.id);
            }
        }
    }
    return ret;
}
95
96 }
97}
98