WordDetector.java
1package org.udger.parser;
2
import java.io.Serializable;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import java.util.logging.Logger;
9
/**
 * Finds which of a set of registered words occur as substrings of a text.
 *
 * <p>Words are registered with {@link #addWord(int, String)} and stored
 * lower-cased in buckets keyed by their first two characters, which must both
 * be in {@code a-z}. {@link #findWords(String)} lower-cases the text, walks
 * over every position and only compares against the words in the bucket
 * selected by the two characters at that position.
 *
 * <p>Case conversion uses {@link Locale#ROOT} so that results do not depend on
 * the JVM default locale.
 */
public class WordDetector implements Serializable {

    private static final long serialVersionUID = -2123898245391386812L;

    private static final Logger LOG = Logger.getLogger(WordDetector.class.getName());

    /**
     * A registered word (already lower-cased) paired with its caller-supplied id.
     * Serializable because instances are reachable from the serializable
     * enclosing class through {@code wordArray}.
     */
    private static class WordInfo implements Serializable {

        private static final long serialVersionUID = 1L;

        final int id;
        final String word;

        WordInfo(int id, String word) {
            this.id = id;
            this.word = word;
        }
    }

    /** Number of letters in {@code a-z}. */
    private static final int ALPHABET_SIZE = 'z' - 'a' + 1;

    /** One bucket per ordered pair of letters. */
    private static final int ARRAY_SIZE = ALPHABET_SIZE * ALPHABET_SIZE;

    /** A word needs at least two characters to select a bucket. */
    private static final int MIN_INDEXABLE_LENGTH = 2;

    /** Buckets of words, indexed by {@link #bucketIndex(char, char)}; unused buckets are null. */
    private final List<WordInfo>[] wordArray;

    /** Length of the shortest indexed word; {@code Integer.MAX_VALUE} while no word is indexed. */
    private int minWordSize = Integer.MAX_VALUE;

    /** Creates a detector with no registered words. */
    @SuppressWarnings({"unchecked", "rawtypes"}) // generic arrays cannot be created directly
    public WordDetector() {
        wordArray = new List[ARRAY_SIZE];
    }

    /**
     * Registers a word under the given id.
     *
     * <p>The word is lower-cased before it is stored. A word that is
     * {@code null}, shorter than two characters, or whose first two
     * (lower-cased) characters are not both in {@code a-z} cannot be indexed;
     * it is logged as a warning and ignored.
     *
     * @param id   identifier reported by {@link #findWords(String)} when the word matches
     * @param word word to register
     */
    public void addWord(int id, String word) {
        if (word == null) {
            LOG.warning("Word cannot be indexed, ignored: " + id + " : null");
            return;
        }

        final String lowered = word.toLowerCase(Locale.ROOT);
        // Validate before touching any state so a rejected word cannot shrink
        // minWordSize and make findWords read past the end of its input.
        final int index = lowered.length() < MIN_INDEXABLE_LENGTH
                ? -1
                : bucketIndex(lowered.charAt(0), lowered.charAt(1));
        if (index < 0) {
            LOG.warning("Word cannot be indexed, ignored: " + id + " : " + lowered);
            return;
        }

        if (lowered.length() < minWordSize) {
            minWordSize = lowered.length();
        }

        List<WordInfo> bucket = wordArray[index];
        if (bucket == null) {
            bucket = new ArrayList<>();
            wordArray[index] = bucket;
        }
        bucket.add(new WordInfo(id, lowered));
    }

    /**
     * Returns the ids of all registered words that occur in the text,
     * compared case-insensitively.
     *
     * @param text text to scan; {@code null} is treated as containing no words
     * @return a new, modifiable set of matching ids; empty when nothing matches
     */
    public Set<Integer> findWords(String text) {
        final Set<Integer> found = new HashSet<>();
        if (text == null) {
            return found;
        }

        final String lowered = text.toLowerCase(Locale.ROOT);
        // No word can start later than this position. The lower bound of two
        // keeps the charAt(pos + 1) lookup in range whatever minWordSize holds.
        final int lastStart = lowered.length() - Math.max(minWordSize, MIN_INDEXABLE_LENGTH);
        for (int pos = 0; pos <= lastStart; pos++) {
            final int index = bucketIndex(lowered.charAt(pos), lowered.charAt(pos + 1));
            if (index < 0) {
                continue;
            }
            final List<WordInfo> bucket = wordArray[index];
            if (bucket == null) {
                continue;
            }
            for (WordInfo candidate : bucket) {
                if (lowered.startsWith(candidate.word, pos)) {
                    found.add(candidate.id);
                }
            }
        }
        return found;
    }

    /**
     * Maps a pair of characters to a bucket index.
     *
     * @return an index in {@code [0, ARRAY_SIZE)}, or {@code -1} if either
     *         character is outside {@code a-z}
     */
    private static int bucketIndex(char first, char second) {
        if (first < 'a' || first > 'z' || second < 'a' || second > 'z') {
            return -1;
        }
        return (first - 'a') * ALPHABET_SIZE + (second - 'a');
    }

}
80}