1package org.udger.parser;
3import java.io.Serializable;
4import java.util.ArrayList;
5import java.util.HashSet;
8import java.util.logging.Logger;
// Serialization version identifier for this class.
12 private static final long serialVersionUID = -2123898245391386812L;
// Logger named after WordDetector; addWord uses it to report words it cannot file into a bucket.
14 private static final Logger LOG = Logger.getLogger(
WordDetector.class.getName());
// Entry type held in the wordArray buckets; constructed from an integer id and a word.
// addWord passes the already lower-cased word when it creates an entry.
16 private static class WordInfo {
20 public WordInfo(
int id, String word) {
// Distance between 'a' and 'z': evaluates to 25, i.e. one less than the 26 letters.
// NOTE(review): addWord and findWords multiply the first character's offset by this
// value (25), so distinct pairs such as "az" and "ba" land on the same index (25).
// findWords re-checks candidates with startsWith, so shared buckets look harmless — confirm.
26 private static final int ARRAY_DIMENSION =
'z' -
'a';
// Number of bucket slots: (25 + 1) * (25 + 1) = 676.
27 private static final int ARRAY_SIZE = (ARRAY_DIMENSION + 1) * (ARRAY_DIMENSION + 1);
// Bucket table indexed by a value derived from a word's first two characters;
// each non-null slot lists the words filed under that index.
29 private List<WordInfo> wordArray[];
// Smallest word length seen by addWord; stays at Integer.MAX_VALUE until a word is added.
// findWords uses it to decide how close to the end of the text it still needs to scan.
30 private int minWordSize = Integer.MAX_VALUE;
// Allocate the bucket table with ARRAY_SIZE slots; slots are null until addWord stores a list in them.
33 wordArray =
new List[ARRAY_SIZE];
36 public void addWord(
int id, String word) {
38 if (word.length() < minWordSize) {
39 minWordSize = word.length();
42 String s = word.toLowerCase();
43 int index = (s.charAt(0) -
'a') * ARRAY_DIMENSION + s.charAt(1) -
'a';
44 if (index >= 0 && index < ARRAY_SIZE) {
45 List<WordInfo> wList = wordArray[index];
47 wList =
new ArrayList<>();
48 wordArray[index] = wList;
50 wList.add(
new WordInfo(
id, s));
52 LOG.warning(
"Index out of hashmap" +
id +
" : "+ s);
56 public Set<Integer> findWords(String text) {
58 Set<Integer> ret =
new HashSet<>();
60 final String s = text.toLowerCase();
61 final int dimension =
'z' -
'a';
62 for(
int i=0; i < s.length() - (minWordSize - 1); i++) {
63 final char c1 = s.charAt(i);
64 final char c2 = s.charAt(i + 1);
65 if (c1 >=
'a' && c1 <= 'z' && c2 >=
'a' && c2 <=
'z') {
66 final int index = (c1 -
'a') * dimension + c2 -
'a';
67 List<WordInfo> l = wordArray[index];
69 for (WordInfo wi : l) {
70 if (s.startsWith(wi.word, i)) {