9package org.udger.parser;
11import org.sqlite.SQLiteConfig;
13import java.io.Closeable;
15import java.io.IOException;
16import java.lang.ref.SoftReference;
17import java.net.Inet4Address;
18import java.net.Inet6Address;
19import java.net.InetAddress;
20import java.net.UnknownHostException;
23import java.util.logging.Logger;
24import java.util.regex.Matcher;
25import java.util.regex.Pattern;
// Logger named after the UdgerParser class.
32 private static final Logger LOG = Logger.getLogger(
UdgerParser.class.getName());
// Database file name constant; it is not referenced anywhere in the visible part of this file.
34 private static final String DB_FILENAME =
"udgerdb_v3.dat";
// URL prefix; fetchDeviceBrand appends the "brand_code" column value to build the brand info URL.
35 private static final String UDGER_UA_DEV_BRAND_LIST_URL =
"https://udger.com/resources/ua-list/devices-brand-detail?brand=";
// IP classification code that the IP-parsing code compares against before blanking the crawler family info URL.
36 private static final String ID_CRAWLER =
"crawler";
// Recognises a Perl-style regex literal ("/.../si", leading slash optional) and captures the expression inside it.
37 private static final Pattern PAT_UNPERLIZE = Pattern.compile(
"^/?(.*?)/si$");
/**
 * Holder for the lookup structures that are built from the database: one ordered regex list
 * and one word detector for each of clients, operating systems and device classes.
 * The word-detector fields assigned in prepare(...) are declared outside the visible lines.
 */
42 public static class ParserDbData {
// Regex entries loaded from udger_client_regex / udger_os_regex / udger_deviceclass_regex.
48 private List<IdRegString> clientRegstringList;
49 private List<IdRegString> osRegstringList;
50 private List<IdRegString> deviceRegstringList;
// Volatile flag, initially false; the code that reads or sets it is not visible here.
52 private volatile boolean prepared =
false;
// Path of the database file; connect() uses it to open or restore the database.
54 private final String dbFileName;
/** Stores the database file name; nothing is loaded by the visible constructor code. */
56 public ParserDbData(String dbFileName) {
57 this.dbFileName = dbFileName;
/**
 * Loads the three regex lists and the three word detectors through the given connection.
 *
 * @param connection open connection to the Udger database
 * @throws SQLException if any of the underlying queries fails
 */
60 protected void prepare(Connection connection)
throws SQLException {
64 clientRegstringList = prepareRegexpStruct(connection,
"udger_client_regex");
65 osRegstringList = prepareRegexpStruct(connection,
"udger_os_regex");
66 deviceRegstringList = prepareRegexpStruct(connection,
"udger_deviceclass_regex");
// Each detector is built from a regex table plus its companion "_words" table.
68 clientWordDetector = createWordDetector(connection,
"udger_client_regex",
"udger_client_regex_words");
69 deviceWordDetector = createWordDetector(connection,
"udger_deviceclass_regex",
"udger_deviceclass_regex_words");
70 osWordDetector = createWordDetector(connection,
"udger_os_regex",
"udger_os_regex_words");
/**
 * Result of client detection: the client id and class id that clientDetector fills in and that
 * osDetector / deviceDetector read for their fallback lookups. Both are boxed and may be null.
 */
79 private static class ClientInfo {
80 private Integer clientId;
81 private Integer classId;
84 private static class IdRegString {
/** Pairs a regex Matcher with the IdRegString entry whose pattern produced it. */
91 private static class MatcherWithIdRegString {
92 private final Matcher matcher;
93 private final IdRegString irs;
// Only the assignment of "matcher" is visible here; "irs" is final, so it is presumably assigned on a line not shown.
95 private MatcherWithIdRegString(Matcher matcher, IdRegString irs) {
96 this.matcher = matcher;
// Lookup structures (regex lists, word detectors) and the database file name.
101 private ParserDbData parserDbData;
// JDBC connection; connect() creates it only while this field is null.
103 private Connection connection;
// Compiled patterns keyed by regex text, softly referenced (see getRegexFromCache).
105 private final Map<String, SoftReference<Pattern>> regexCache =
new HashMap<>();
// Prepared statements keyed by SQL text; reused across calls and closed in close().
107 private Map<String, PreparedStatement> preparedStmtMap =
new HashMap<>();
// Feature switches consulted by the user-agent parsing code; all three default to enabled.
111 private boolean osParserEnabled =
true;
112 private boolean deviceParserEnabled =
true;
113 private boolean deviceBrandParserEnabled =
true;
// When true, connect() opens an in-memory SQLite database and restores the file into it.
114 private boolean inMemoryEnabled =
false;
122 this(parserDbData, 10000);
/**
 * Creates a parser backed by the given database data.
 *
 * @param parserDbData  lookup structures and database file name
 * @param cacheCapacity only values greater than zero enter the branch below; what that branch
 *                      sets up is not visible here (presumably the result cache - TODO confirm)
 */
131 public UdgerParser(ParserDbData parserDbData,
int cacheCapacity) {
132 this.parserDbData = parserDbData;
133 if (cacheCapacity > 0) {
/**
 * Creates a parser like the two-argument constructor and additionally records whether the
 * database should be loaded into memory when the connection is opened.
 *
 * @param parserDbData    lookup structures and database file name
 * @param inMemoryEnabled true to restore the database file into an in-memory SQLite database
 * @param cacheCapacity   forwarded unchanged to the two-argument constructor
 */
145 public UdgerParser(ParserDbData parserDbData,
boolean inMemoryEnabled,
int cacheCapacity) {
146 this(parserDbData, cacheCapacity);
147 this.inMemoryEnabled = inMemoryEnabled;
/**
 * Releases the JDBC resources held by this parser: every cached prepared statement is closed
 * and the statement map is emptied, then the connection is examined if it is still open
 * (the statements inside that branch are not visible here).
 *
 * @throws IOException if a JDBC call fails with an SQLException
 */
151 public void close() throws IOException {
153 for (PreparedStatement preparedStmt : preparedStmtMap.values()) {
154 preparedStmt.close();
156 preparedStmtMap.clear();
157 if (connection !=
null && !connection.isClosed()) {
165 }
catch (SQLException e) {
// NOTE(review): only the message is forwarded, so the SQLException and its stack trace are lost;
// isValid(...) passes the exception as the cause - consider doing the same here.
166 throw new IOException(e.getMessage());
177 public boolean isValid(
int timeoutMillis)
throws IOException {
179 return connection ==
null || connection.isValid(timeoutMillis);
180 }
catch (SQLException e) {
181 throw new IOException(
"Failed to validate connection within " + timeoutMillis +
" millis.", e);
200 ret = cache.get(uaString);
210 ClientInfo clientInfo = clientDetector(uaString, ret);
212 if (!
"Crawler".equals(ret.getUaClass())) {
213 if (osParserEnabled) {
214 osDetector(uaString, ret, clientInfo);
217 if (deviceParserEnabled) {
218 deviceDetector(uaString, ret, clientInfo);
221 if (deviceBrandParserEnabled) {
222 if (ret.getOsFamilyCode() !=
null && !ret.getOsFamilyCode().isEmpty()) {
223 fetchDeviceBrand(uaString, ret);
229 cache.put(uaString, ret);
247 InetAddress addr = InetAddress.getByName(ipString);
249 String normalizedIp =
null;
251 if (addr instanceof Inet4Address) {
253 for (
byte b : addr.getAddress()) {
254 ipv4int = ipv4int << 8 | (b & 0xFF);
256 normalizedIp = addr.getHostAddress();
257 }
else if (addr instanceof Inet6Address) {
258 normalizedIp = addr.getHostAddress().replaceAll(
"((?:(?:^|:)0+\\b){2,}):?(?!\\S*\\b\\1:0+\\b)(\\S*)",
"::$2");
261 ret.setIpClassification(
"Unrecognized");
262 ret.setIpClassificationCode(
"unrecognized");
264 if (normalizedIp !=
null) {
268 try (ResultSet ipRs = getFirstRow(
UdgerSqlQuery.SQL_IP, normalizedIp)) {
269 if (ipRs !=
null && ipRs.next()) {
270 fetchUdgerIp(ipRs, ret);
271 if (!ID_CRAWLER.equals(ret.getIpClassificationCode())) {
272 ret.setCrawlerFamilyInfoUrl(
"");
277 if (ipv4int !=
null) {
279 ResultSet dataCenterRs = getFirstRow(
UdgerSqlQuery.SQL_DATACENTER, ipv4int, ipv4int);
280 fetchDataCenterAndCloseRs(dataCenterRs, ret);
283 int[] ipArray = ip6ToArray((Inet6Address) addr);
284 ResultSet dataCenterRs = getFirstRow(
UdgerSqlQuery.SQL_DATACENTER_RANGE6,
285 ipArray[0], ipArray[0],
286 ipArray[1], ipArray[1],
287 ipArray[2], ipArray[2],
288 ipArray[3], ipArray[3],
289 ipArray[4], ipArray[4],
290 ipArray[5], ipArray[5],
291 ipArray[6], ipArray[6],
292 ipArray[7], ipArray[7]
294 fetchDataCenterAndCloseRs(dataCenterRs, ret);
/**
 * Copies data-center information from the first row of the result set, if there is one, into
 * {@code ret}, and closes the result set. A null result set is skipped by the visible null check.
 *
 * @param dataCenterRs result of a data-center lookup; may be null
 * @param ret          IP result to populate
 * @throws SQLException if reading or closing the result set fails
 */
301 private void fetchDataCenterAndCloseRs(ResultSet dataCenterRs,
UdgerIpResult ret)
throws SQLException {
302 if (dataCenterRs !=
null) {
304 if (dataCenterRs.next()) {
305 fetchDataCenter(dataCenterRs, ret);
// NOTE(review): the lines around this close() are not visible; confirm it runs even when next() or fetchDataCenter throws.
308 dataCenterRs.close();
319 return osParserEnabled;
336 this.osParserEnabled = osParserEnabled;
345 return deviceParserEnabled;
362 this.deviceParserEnabled = deviceParserEnabled;
371 return deviceBrandParserEnabled;
388 this.deviceBrandParserEnabled = deviceBrandParserEnabled;
/**
 * Builds a WordDetector from the rows of {@code wordTableName} whose id is referenced by the
 * "word_id" or "word2_id" column of {@code regexTableName}.
 *
 * @param connection     open database connection
 * @param regexTableName regex table whose word references decide which words are kept
 * @param wordTableName  table providing the "id" and "word" columns
 * @throws SQLException if a query fails
 */
391 private static WordDetector createWordDetector(Connection connection, String regexTableName, String wordTableName)
throws SQLException {
393 Set<Integer> usedWords =
new HashSet<>();
// Collect every word id referenced from either word column of the regex table.
395 addUsedWords(usedWords, connection, regexTableName,
"word_id");
396 addUsedWords(usedWords, connection, regexTableName,
"word2_id");
// The table name is concatenated into the SQL text; the callers visible in this file pass literals only.
400 try (
final Statement statement = connection.createStatement();
401 final ResultSet rs = statement.executeQuery(
"SELECT * FROM " + wordTableName)) {
404 int id = rs.getInt(
"id");
405 if (usedWords.contains(
id)) {
// NOTE(review): toLowerCase() uses the default locale, and a NULL "word" value would throw a NullPointerException here.
406 String word = rs.getString(
"word").toLowerCase();
407 result.addWord(
id, word);
/**
 * Adds the values of one integer column of a regex table to {@code usedWords}.
 *
 * @param usedWords      set that receives the ids
 * @param connection     open database connection
 * @param regexTableName table to read from (concatenated into the SQL text)
 * @param wordIdColumn   column to select (concatenated into the SQL text)
 * @throws SQLException if the query fails
 */
415 private static void addUsedWords(Set<Integer> usedWords, Connection connection, String regexTableName, String wordIdColumn)
throws SQLException {
416 try (Statement statement = connection.createStatement();
417 ResultSet rs = statement.executeQuery(
"SELECT " + wordIdColumn +
" FROM " + regexTableName)) {
// ResultSet.getInt returns 0 for SQL NULL, so NULL references end up as id 0 in the set.
420 usedWords.add(rs.getInt(wordIdColumn));
/**
 * Walks the regex entries in list order and considers only those whose required words
 * (wordId1 / wordId2, where 0 means "no word required") were all found in the user agent.
 * For such an entry a Matcher over {@code uaString} is created and returned with the entry.
 * The condition guarding the return and the no-match result are on lines not visible here.
 *
 * @param uaString         user-agent string to match
 * @param foundClientWords ids of the words detected in {@code uaString}
 * @param list             regex entries to try, in priority order
 */
426 private MatcherWithIdRegString findMatcherIdRegString(String uaString, Set<Integer> foundClientWords, List<IdRegString> list) {
427 for (IdRegString irs : list) {
// Cheap word pre-filter before the regex is applied.
428 if ((irs.wordId1 == 0 || foundClientWords.contains(irs.wordId1)) &&
429 (irs.wordId2 == 0 || foundClientWords.contains(irs.wordId2))) {
430 Matcher matcher = irs.pattern.matcher(uaString);
432 return new MatcherWithIdRegString(matcher, irs);
/**
 * Loads a regex table into a list of IdRegString entries, ordered by the table's "sequence"
 * column, with each regex compiled case-insensitively and in DOTALL mode.
 *
 * @param connection      open database connection
 * @param regexpTableName table to read (concatenated into the SQL text)
 * @throws SQLException if the query fails
 */
438 private static List<IdRegString> prepareRegexpStruct(Connection connection, String regexpTableName)
throws SQLException {
439 List<IdRegString> ret =
new ArrayList<>();
440 try (Statement statement = connection.createStatement();
441 ResultSet rs = statement.executeQuery(
"SELECT rowid, regstring, word_id, word2_id FROM " + regexpTableName +
" ORDER BY sequence")) {
444 IdRegString irs =
new IdRegString();
445 irs.id = rs.getInt(
"rowid");
446 irs.wordId1 = rs.getInt(
"word_id");
447 irs.wordId2 = rs.getInt(
"word2_id");
448 String regex = rs.getString(
"regstring");
// The stored regex is tested against the Perl-literal pattern; how the match result is used is not visible here
// (presumably the "/.../si" wrapper is stripped before compiling - TODO confirm).
449 Matcher m = PAT_UNPERLIZE.matcher(regex);
453 irs.pattern = Pattern.compile(regex, Pattern.CASE_INSENSITIVE | Pattern.DOTALL);
/**
 * Determines the client (crawler, browser, ...) for a user-agent string and fills the
 * user-agent part of {@code ret}.
 *
 * The crawler table is queried with the whole user-agent string first; on a hit the class id is
 * set to 99 and the client id to -1. The regex-based lookup and the "Unrecognized" fallback that
 * follow are presumably the alternative branches - the else lines are not visible here.
 *
 * @param uaString user-agent string to analyse
 * @param ret      result object to populate
 * @return the detected client id and class id
 * @throws SQLException if a database lookup fails
 */
461 private ClientInfo clientDetector(String uaString, UdgerUaResult ret)
throws SQLException {
462 ClientInfo clientInfo =
new ClientInfo();
463 try (ResultSet userAgentRs1 = getFirstRow(UdgerSqlQuery.SQL_CRAWLER, uaString)) {
464 if (userAgentRs1 !=
null && userAgentRs1.next()) {
465 fetchUserAgent(userAgentRs1, ret);
466 clientInfo.classId = 99;
467 clientInfo.clientId = -1;
// Regex lookup restricted to entries whose words occur in the user agent.
469 MatcherWithIdRegString mwirs = findMatcherIdRegString(uaString, parserDbData.clientWordDetector.findWords(uaString), parserDbData.clientRegstringList);
471 try (ResultSet userAgentRs2 = getFirstRow(UdgerSqlQuery.SQL_CLIENT, mwirs.irs.id)) {
472 if (userAgentRs2 !=
null && userAgentRs2.next()) {
473 fetchUserAgent(userAgentRs2, ret);
474 clientInfo.classId = ret.getClassId();
475 clientInfo.clientId = ret.getClientId();
// Overrides version fields using the capture group of the regex that matched.
476 patchVersions(mwirs.matcher, ret);
480 ret.setUaClass(
"Unrecognized");
481 ret.setUaClassCode(
"unrecognized");
/**
 * Fills the operating-system part of {@code ret}.
 *
 * An OS regex entry is looked up for the user agent and its row is fetched. A second lookup by
 * client id is made only when that id is non-null and non-zero; it is presumably the fallback
 * for when no regex matched - the branching lines are not visible here.
 *
 * @param uaString   user-agent string to analyse
 * @param ret        result object to populate
 * @param clientInfo ids produced by clientDetector
 * @throws SQLException if a database lookup fails
 */
488 private void osDetector(String uaString, UdgerUaResult ret, ClientInfo clientInfo)
throws SQLException {
489 MatcherWithIdRegString mwirs = findMatcherIdRegString(uaString, parserDbData.osWordDetector.findWords(uaString), parserDbData.osRegstringList);
491 try (ResultSet opSysRs = getFirstRow(UdgerSqlQuery.SQL_OS, mwirs.irs.id)) {
492 if (opSysRs !=
null && opSysRs.next()) {
493 fetchOperatingSystem(opSysRs, ret);
497 if (clientInfo.clientId !=
null && clientInfo.clientId != 0) {
// The client id is bound as a String parameter.
498 try (ResultSet opSysRs = getFirstRow(UdgerSqlQuery.SQL_CLIENT_OS, clientInfo.clientId.toString())) {
499 if (opSysRs !=
null && opSysRs.next()) {
500 fetchOperatingSystem(opSysRs, ret);
/**
 * Fills the device-class part of {@code ret}.
 *
 * A device-class regex entry is looked up for the user agent and its row is fetched. A second
 * lookup by client class id is made only when that id is non-null and not -1; it is presumably
 * the fallback for when no regex matched - the branching lines are not visible here.
 *
 * @param uaString   user-agent string to analyse
 * @param ret        result object to populate
 * @param clientInfo ids produced by clientDetector
 * @throws SQLException if a database lookup fails
 */
507 private void deviceDetector(String uaString, UdgerUaResult ret, ClientInfo clientInfo)
throws SQLException {
508 MatcherWithIdRegString mwirs = findMatcherIdRegString(uaString, parserDbData.deviceWordDetector.findWords(uaString), parserDbData.deviceRegstringList);
510 try (ResultSet devRs = getFirstRow(UdgerSqlQuery.SQL_DEVICE, mwirs.irs.id)) {
511 if (devRs !=
null && devRs.next()) {
512 fetchDevice(devRs, ret);
516 if (clientInfo.classId !=
null && clientInfo.classId != -1) {
// The class id is bound as a String parameter.
517 try (ResultSet devRs = getFirstRow(UdgerSqlQuery.SQL_CLIENT_CLASS, clientInfo.classId.toString())) {
518 if (devRs !=
null && devRs.next()) {
519 fetchDevice(devRs, ret);
/**
 * Looks up the device brand and market name for a user agent.
 *
 * The device regexes registered for the result's OS family code and OS code are tried in turn;
 * when one finds a match in {@code uaString}, its first capture group and the regex id are used
 * to query the device name list, and the brand fields of {@code ret} are filled from that row.
 *
 * @param uaString user-agent string to analyse
 * @param ret      result object; its OS family code and OS code are read, its brand fields written
 * @throws SQLException if a database lookup fails
 */
526 private void fetchDeviceBrand(String uaString, UdgerUaResult ret)
throws SQLException {
// Same statement cache as getFirstRow, but without the one-row limit: all regex rows are iterated.
527 PreparedStatement preparedStatement = preparedStmtMap.get(UdgerSqlQuery.SQL_DEVICE_REGEX);
528 if (preparedStatement ==
null) {
529 preparedStatement = connection.prepareStatement(UdgerSqlQuery.SQL_DEVICE_REGEX);
530 preparedStmtMap.put(UdgerSqlQuery.SQL_DEVICE_REGEX, preparedStatement);
532 preparedStatement.setObject(1, ret.getOsFamilyCode());
533 preparedStatement.setObject(2, ret.getOsCode());
534 try (ResultSet devRegexRs = preparedStatement.executeQuery()) {
535 if (devRegexRs !=
null) {
536 while (devRegexRs.next()) {
537 String devId = devRegexRs.getString(
"id");
538 String regex = devRegexRs.getString(
"regstring");
539 if (devId !=
null && regex !=
null) {
540 Pattern patRegex = getRegexFromCache(regex);
541 Matcher matcher = patRegex.matcher(uaString);
542 if (matcher.find()) {
// group(1) requires the stored regex to define at least one capture group.
543 try (ResultSet devNameListRs = getFirstRow(UdgerSqlQuery.SQL_DEVICE_NAME_LIST, devId, matcher.group(1))) {
544 if (devNameListRs !=
null && devNameListRs.next()) {
// NOTE(review): unlike the other fetch* methods, these values are not passed through nvl(), so SQL NULLs stay null.
545 ret.setDeviceMarketname(devNameListRs.getString(
"marketname"));
546 ret.setDeviceBrand(devNameListRs.getString(
"brand"));
547 ret.setDeviceBrandCode(devNameListRs.getString(
"brand_code"));
548 ret.setDeviceBrandHomepage(devNameListRs.getString(
"brand_url"));
549 ret.setDeviceBrandIcon(devNameListRs.getString(
"icon"));
550 ret.setDeviceBrandIconBig(devNameListRs.getString(
"icon_big"));
551 ret.setDeviceBrandInfoUrl(UDGER_UA_DEV_BRAND_LIST_URL + devNameListRs.getString(
"brand_code"));
563 private int[] ip6ToArray(Inet6Address addr) {
564 int ret[] =
new int[8];
565 byte[] bytes = addr.getAddress();
566 for (
int i = 0; i < 8; i++) {
567 ret[i] = ((bytes[i * 2] << 8) & 0xff00) | (bytes[i * 2 + 1] & 0xff);
/**
 * Has the shared ParserDbData build its lookup structures through this parser's connection.
 * Any statements before the call below are not visible here.
 *
 * @throws SQLException if loading the lookup structures fails
 */
572 private void prepare() throws SQLException {
574 parserDbData.prepare(connection);
/**
 * Opens the database connection if none exists yet (the whole body is guarded by the null check).
 *
 * With inMemoryEnabled an in-memory SQLite database is opened and the database file is loaded
 * into it with a "restore from" statement. The file-based connection further down uses the
 * read-only configuration and is presumably the else branch - that line is not visible here.
 *
 * @throws SQLException if the connection cannot be opened
 */
577 private void connect() throws SQLException {
578 if (connection ==
null) {
579 SQLiteConfig config =
new SQLiteConfig();
580 config.setReadOnly(
true);
581 if (inMemoryEnabled) {
// The in-memory connection is opened without the read-only configuration properties.
583 connection = DriverManager.getConnection(
"jdbc:sqlite::memory:");
584 File dbfile =
new File(parserDbData.dbFileName);
585 try (Statement statement = connection.createStatement()) {
// The file path is concatenated into the statement text unquoted.
586 statement.executeUpdate(
"restore from " + dbfile.getPath());
587 }
catch (Exception e) {
// NOTE(review): the failure is only logged, without the exception itself, and the parser
// continues with whatever the in-memory database contains at that point.
588 LOG.warning(
"Error re-constructing in memory data base from Db file " + dbfile);
591 connection = DriverManager.getConnection(
"jdbc:sqlite:" + parserDbData.dbFileName, config.toProperties());
/**
 * Returns the compiled form of a device regex, compiling and caching it when the cache has no
 * entry for it or the softly referenced Pattern has been cleared.
 *
 * @param regex regex text as stored in the database
 * @return the Pattern, compiled case-insensitively and in DOTALL mode
 */
596 private Pattern getRegexFromCache(String regex) {
597 SoftReference<Pattern> patRegex = regexCache.get(regex);
598 if (patRegex ==
null || patRegex.get() ==
null) {
// The regex is tested against the Perl-literal pattern; how the match is used is not visible here.
// NOTE(review): if "regex" is reassigned before the put below, the cache key would differ from the lookup key - confirm.
599 Matcher m = PAT_UNPERLIZE.matcher(regex);
603 patRegex =
new SoftReference<>(Pattern.compile(regex, Pattern.CASE_INSENSITIVE | Pattern.DOTALL));
604 regexCache.put(regex, patRegex);
// NOTE(review): the Pattern is re-read from the soft reference without a strong local reference being held,
// so in principle it could have been cleared and null returned - consider keeping it in a local variable.
606 return patRegex.get();
609 private ResultSet getFirstRow(String query, Object... params)
throws SQLException {
610 PreparedStatement preparedStatement = preparedStmtMap.get(query);
611 if (preparedStatement ==
null) {
612 preparedStatement = connection.prepareStatement(query);
613 preparedStmtMap.put(query, preparedStatement);
615 for (
int i = 0; i < params.length; i++) {
616 preparedStatement.setObject(i + 1, params[i]);
618 preparedStatement.setMaxRows(1);
619 return preparedStatement.executeQuery();
623 private void fetchUserAgent(ResultSet rs, UdgerUaResult ret)
throws SQLException {
624 ret.setClassId(rs.getInt(
"class_id"));
625 ret.setClientId(rs.getInt(
"client_id"));
626 ret.setCrawlerCategory(nvl(rs.getString(
"crawler_category")));
627 ret.setCrawlerCategoryCode(nvl(rs.getString(
"crawler_category_code")));
628 ret.setCrawlerLastSeen(nvl(rs.getString(
"crawler_last_seen")));
629 ret.setCrawlerRespectRobotstxt(nvl(rs.getString(
"crawler_respect_robotstxt")));
630 ret.setUa(nvl(rs.getString(
"ua")));
631 ret.setUaClass(nvl(rs.getString(
"ua_class")));
632 ret.setUaClassCode(nvl(rs.getString(
"ua_class_code")));
633 ret.setUaEngine(nvl(rs.getString(
"ua_engine")));
634 ret.setUaFamily(nvl(rs.getString(
"ua_family")));
635 ret.setUaFamilyCode(nvl(rs.getString(
"ua_family_code")));
636 ret.setUaFamilyHomepage(nvl(rs.getString(
"ua_family_homepage")));
637 ret.setUaFamilyIcon(nvl(rs.getString(
"ua_family_icon")));
638 ret.setUaFamilyIconBig(nvl(rs.getString(
"ua_family_icon_big")));
639 ret.setUaFamilyInfoUrl(nvl(rs.getString(
"ua_family_info_url")));
640 ret.setUaFamilyVendor(nvl(rs.getString(
"ua_family_vendor")));
641 ret.setUaFamilyVendorCode(nvl(rs.getString(
"ua_family_vendor_code")));
642 ret.setUaFamilyVendorHomepage(nvl(rs.getString(
"ua_family_vendor_homepage")));
643 ret.setUaUptodateCurrentVersion(nvl(rs.getString(
"ua_uptodate_current_version")));
644 ret.setUaVersion(nvl(rs.getString(
"ua_version")));
645 ret.setUaVersionMajor(nvl(rs.getString(
"ua_version_major")));
648 private void fetchOperatingSystem(ResultSet rs, UdgerUaResult ret)
throws SQLException {
649 ret.setOsFamily(nvl(rs.getString(
"os_family")));
650 ret.setOs(nvl(rs.getString(
"os")));
651 ret.setOsCode(nvl(rs.getString(
"os_code")));
652 ret.setOsFamilyCode(nvl(rs.getString(
"os_family_code")));
653 ret.setOsFamilyVendorHomepage(nvl(rs.getString(
"os_family_vendor_homepage")));
654 ret.setOsFamilyVendor(nvl(rs.getString(
"os_family_vendor")));
655 ret.setOsFamilyVendorCode(nvl(rs.getString(
"os_family_vendor_code")));
656 ret.setOsHomePage(nvl(rs.getString(
"os_home_page")));
657 ret.setOsIcon(nvl(rs.getString(
"os_icon")));
658 ret.setOsIconBig(nvl(rs.getString(
"os_icon_big")));
659 ret.setOsInfoUrl(nvl(rs.getString(
"os_info_url")));
662 private void fetchDevice(ResultSet rs, UdgerUaResult ret)
throws SQLException {
663 ret.setDeviceClass(nvl(rs.getString(
"device_class")));
664 ret.setDeviceClassCode(nvl(rs.getString(
"device_class_code")));
665 ret.setDeviceClassIcon(nvl(rs.getString(
"device_class_icon")));
666 ret.setDeviceClassIconBig(nvl(rs.getString(
"device_class_icon_big")));
667 ret.setDeviceClassInfoUrl(nvl(rs.getString(
"device_class_info_url")));
/**
 * Derives the version fields of {@code ret} from the regex match that identified the client.
 *
 * When a matcher is given and its pattern has at least one capture group, group 1 is taken as
 * the version: it becomes the UA version, its first dot-separated segment becomes the major
 * version, and it is appended to the UA name after a space. The empty-string assignments at the
 * end presumably belong to the branch without a matcher - the else lines are not visible here.
 *
 * @param lastPatternMatcher matcher of the client regex; may be null
 * @param ret                result object to update
 */
670 private void patchVersions(Matcher lastPatternMatcher, UdgerUaResult ret) {
671 if (lastPatternMatcher !=
null) {
673 if (lastPatternMatcher.groupCount() >= 1) {
674 version = lastPatternMatcher.group(1);
// group(1) is null when the group did not take part in the match; the handling is on lines not visible here.
675 if (version ==
null) {
679 ret.setUaVersion(version);
680 String versionSegments[] = version.split(
"\\.");
681 if (versionSegments.length > 0) {
// NOTE(review): the version is split a second time here; versionSegments[0] already holds this value.
682 ret.setUaVersionMajor(version.split(
"\\.")[0]);
684 ret.setUaVersionMajor(
"");
// A null UA name is treated as empty before the version is appended.
686 ret.setUa((ret.getUa() !=
null ? ret.getUa() :
"") +
" " + version);
688 ret.setUaVersion(
"");
689 ret.setUaVersionMajor(
"");
693 private void fetchUdgerIp(ResultSet rs, UdgerIpResult ret)
throws SQLException {
694 ret.setCrawlerCategory(nvl(rs.getString(
"crawler_category")));
695 ret.setCrawlerCategoryCode(nvl(rs.getString(
"crawler_category_code")));
696 ret.setCrawlerFamily(nvl(rs.getString(
"crawler_family")));
697 ret.setCrawlerFamilyCode(nvl(rs.getString(
"crawler_family_code")));
698 ret.setCrawlerFamilyHomepage(nvl(rs.getString(
"crawler_family_homepage")));
699 ret.setCrawlerFamilyIcon(nvl(rs.getString(
"crawler_family_icon")));
700 ret.setCrawlerFamilyInfoUrl(nvl(rs.getString(
"crawler_family_info_url")));
701 ret.setCrawlerFamilyVendor(nvl(rs.getString(
"crawler_family_vendor")));
702 ret.setCrawlerFamilyVendorCode(nvl(rs.getString(
"crawler_family_vendor_code")));
703 ret.setCrawlerFamilyVendorHomepage(nvl(rs.getString(
"crawler_family_vendor_homepage")));
704 ret.setCrawlerLastSeen(nvl(rs.getString(
"crawler_last_seen")));
705 ret.setCrawlerName(nvl(rs.getString(
"crawler_name")));
706 ret.setCrawlerRespectRobotstxt(nvl(rs.getString(
"crawler_respect_robotstxt")));
707 ret.setCrawlerVer(nvl(rs.getString(
"crawler_ver")));
708 ret.setCrawlerVerMajor(nvl(rs.getString(
"crawler_ver_major")));
709 ret.setIpCity(nvl(rs.getString(
"ip_city")));
710 ret.setIpClassification(nvl(rs.getString(
"ip_classification")));
711 ret.setIpClassificationCode(nvl(rs.getString(
"ip_classification_code")));
712 ret.setIpCountry(nvl(rs.getString(
"ip_country")));
713 ret.setIpCountryCode(nvl(rs.getString(
"ip_country_code")));
714 ret.setIpHostname(nvl(rs.getString(
"ip_hostname")));
715 ret.setIpLastSeen(nvl(rs.getString(
"ip_last_seen")));
718 private String nvl(String v) {
719 return v !=
null ? v :
"";
722 private void fetchDataCenter(ResultSet rs, UdgerIpResult ret)
throws SQLException {
723 ret.setDataCenterHomePage(nvl(rs.getString(
"datacenter_homepage")));
724 ret.setDataCenterName(nvl(rs.getString(
"datacenter_name")));
725 ret.setDataCenterNameCode(nvl(rs.getString(
"datacenter_name_code")));
void setDeviceParserEnabled(boolean deviceParserEnabled)
UdgerParser(ParserDbData parserDbData, int cacheCapacity)
boolean isOsParserEnabled()
boolean isDeviceParserEnabled()
boolean isDeviceBrandParserEnabled()
UdgerIpResult parseIp(String ipString)
boolean isValid(int timeoutMillis)
UdgerUaResult parseUa(String uaString)
void setOsParserEnabled(boolean osParserEnabled)
void setDeviceBrandParserEnabled(boolean deviceBrandParserEnabled)
UdgerParser(ParserDbData parserDbData, boolean inMemoryEnabled, int cacheCapacity)
UdgerParser(ParserDbData parserDbData)