21using System.Collections.Generic;
22using System.Runtime.CompilerServices;
23using System.Text.RegularExpressions;
30 public UserAgent userAgent {
get;
private set; }
31 public IPAddress ipAddress {
get;
private set; }
33 public string ip {
get;
set; }
34 public string ua {
get;
set; }
36 #region private Variables
37 private struct IdRegString
42 public string pattern;
46 private bool useCache;
52 private static List<IdRegString> clientRegstringList;
53 private static List<IdRegString> osRegstringList;
54 private static List<IdRegString> deviceRegstringList;
56 private readonly Dictionary<string, Regex> regexCache =
new Dictionary<string, Regex>();
57 private readonly Dictionary<string, string> preparedStmtMap =
new Dictionary<string, string>();
91 public UdgerParser(
bool useLRUCash =
true,
int LRUCashCapacity = 10000)
98 this.useCache = useLRUCash;
104 #region setParser method
111 if (!Directory.Exists(dataDir))
112 throw new Exception(
"Data dir not found");
114 dt.data_dir = dataDir;
115 dt.DataSourcePath = dataDir +
@"\udgerdb_v3.dat";
117 if (!File.Exists(dt.DataSourcePath))
118 throw new Exception(
"Data file udgerdb_v3.dat not found");
127 if (!Directory.Exists(dataDir))
128 throw new Exception(
"Data dir not found");
130 dt.data_dir = dataDir;
131 dt.DataSourcePath = dataDir +
@"\" + fileName;
133 if (!File.Exists(dt.DataSourcePath))
134 throw new Exception(
"Data file " + fileName +
" not found");
138 #region public method
154 if (useCache && cache.TryGetValue(
this.ua, out uaCache))
158 this.parseUA(this.ua.Replace(
"'",
"''"));
164 this.parseIP(this.ip.Replace(
"'",
"''"));
172 #region private method
175 private void parseUA(
string _userAgent)
178 int client_class_id = -1;
181 if (!
string.IsNullOrEmpty(_userAgent))
183 userAgent.UaString = this.ua;
184 userAgent.UaClass =
"Unrecognized";
185 userAgent.UaClassCode =
"unrecognized";
190 this.processClient(_userAgent, ref os_id, ref client_id, ref client_class_id);
192 this.processOS(_userAgent, ref os_id, client_id);
194 this.processDevice(_userAgent, ref client_class_id);
196 if (userAgent.OsFamilyCode !=
null && userAgent.OsFamilyCode !=
"" )
198 this.processDeviceBrand();
202 cache.Set(_userAgent, this.userAgent);
208 private void parseIP(
string _ip)
211 if (!
string.IsNullOrEmpty(_ip))
213 ipAddress.Ip = this.ip;
217 int ipVer = this.getIPAddressVersion(ip, out ipLoc);
225 DataTable ipTable = dt.selectQuery(
@"SELECT udger_crawler_list.id as botid,ip_last_seen,ip_hostname,ip_country,ip_city,ip_country_code,ip_classification,ip_classification_code,
226 name,ver,ver_major,last_seen,respect_robotstxt,family,family_code,family_homepage,family_icon,vendor,vendor_code,vendor_homepage,crawler_classification,crawler_classification_code,crawler_classification
228 JOIN udger_ip_class ON udger_ip_class.id=udger_ip_list.class_id
229 LEFT JOIN udger_crawler_list ON udger_crawler_list.id=udger_ip_list.crawler_id
230 LEFT JOIN udger_crawler_class ON udger_crawler_class.id=udger_crawler_list.class_id
231 WHERE ip=" +
'"' + _ip +
'"' +
" ORDER BY sequence");
233 if (ipTable !=
null && ipTable.Rows.Count > 0)
235 this.prepareIp(ipTable.Rows[0]);
239 long ipLong = this.AddrToInt(_ip);
241 DataTable dataCenter = dt.selectQuery(
@"select name, name_code, homepage
242 FROM udger_datacenter_range
243 JOIN udger_datacenter_list ON udger_datacenter_range.datacenter_id = udger_datacenter_list.id
244 where iplong_from <= " + ipLong.ToString() +
" AND iplong_to >=" + ipLong.ToString());
246 if (dataCenter !=
null && dataCenter.Rows.Count > 0)
248 this.prepareIpDataCenter(dataCenter.Rows[0]);
259 #region process methods
261 private void processOS(
string uaString, ref
int os_id,
int clientId)
264 int rowid = findIdFromList(uaString, osWordDetector.findWords(uaString), osRegstringList);
267 string q = String.Format(UdgerSqlQuery.SQL_OS, rowid);
268 DataTable opSysRs = dt.selectQuery(q);
269 this.prepareOs(opSysRs.Rows[0], ref os_id);
271 else if(clientId != 0)
273 DataTable opSysRs = dt.selectQuery(String.Format(UdgerSqlQuery.SQL_CLIENT_OS, clientId));
274 if (opSysRs !=
null && opSysRs.Rows.Count > 0)
276 this.prepareOs(opSysRs.Rows[0], ref os_id);
283 private void processClient(
string uaString, ref
int os_id, ref
int clientId, ref
int classId)
285 string q = String.Format(UdgerSqlQuery.SQL_CRAWLER, uaString);
286 DataTable userAgentRs = dt.selectQuery(q);
287 if (userAgentRs !=
null && userAgentRs.Rows.Count > 0 )
290 this.prepareUa(userAgentRs.Rows[0],
true, ref clientId, ref classId);
295 int rowid = this.findIdFromList(uaString, clientWordDetector.findWords(uaString), clientRegstringList);
298 userAgentRs = dt.selectQuery(String.Format(UdgerSqlQuery.SQL_CLIENT, rowid));
299 this.prepareUa(userAgentRs.Rows[0],
false, ref clientId, ref classId);
303 userAgent.UaClass =
"Unrecognized";
304 userAgent.UaClassCode =
"unrecognized";
309 private void processDevice(
string uaString, ref
int classId)
311 int rowid = this.findIdFromList(uaString, deviceWordDetector.findWords(uaString), deviceRegstringList);
314 DataTable devRs = dt.selectQuery(String.Format(UdgerSqlQuery.SQL_DEVICE, rowid));
315 this.prepareDevice(devRs.Rows[0], ref classId);
320 DataTable devRs = dt.selectQuery(String.Format(UdgerSqlQuery.SQL_CLIENT_CLASS, classId.ToString()));
321 if (devRs !=
null && devRs.Rows.Count > 0)
323 this.prepareDevice(devRs.Rows[0], ref classId);
329 private void processDeviceBrand()
331 System.Text.RegularExpressions.Regex reg;
332 PerlRegExpConverter regConv;
334 DataTable devRs = dt.selectQuery(String.Format(UdgerSqlQuery.SQL_DEVICE_REGEX,
this.userAgent.OsFamilyCode,
this.userAgent.OsCode));
335 if (devRs !=
null && devRs.Rows.Count > 0)
337 foreach (DataRow row
in devRs.Rows)
339 String devId =
UdgerParser.ConvertToStr(row[
"id"]);
340 String regex =
UdgerParser.ConvertToStr(row[
"regstring"]);
341 if (devId !=
null && regex !=
null)
343 regConv =
new PerlRegExpConverter(regex,
"", Encoding.UTF8);
345 if (reg.IsMatch(
this.ua))
347 string foo = reg.Match(this.ua).Groups[1].ToString();
348 DataTable devNameListRs = dt.selectQuery(String.Format(UdgerSqlQuery.SQL_DEVICE_NAME_LIST, devId, foo));
349 if (devNameListRs !=
null && devNameListRs.Rows.Count > 0)
351 DataRow r = devNameListRs.Rows[0];
352 userAgent.DeviceMarketname =
UdgerParser.ConvertToStr(r[
"marketname"]);
353 userAgent.DeviceBrand =
UdgerParser.ConvertToStr(r[
"brand"]);
354 userAgent.DeviceBrandCode =
UdgerParser.ConvertToStr(r[
"brand_code"]);
355 userAgent.DeviceBrandHomepage =
UdgerParser.ConvertToStr(r[
"brand_url"]);
356 userAgent.DeviceBrandIcon =
UdgerParser.ConvertToStr(r[
"icon"]);
357 userAgent.DeviceBrandIconBig =
UdgerParser.ConvertToStr(r[
"icon_big"]);
358 userAgent.DeviceBrandInfoUrl =
@"https://udger.com/resources/ua-list/devices-brand-detail?brand=" +
UdgerParser.ConvertToStr(r[
"brand_code"]);
368 #region prepare data methods
370 private void prepareUa(DataRow _row,Boolean crawler,ref
int clientId, ref
int classId)
372 System.Text.RegularExpressions.Regex searchTerm;
373 PerlRegExpConverter regConv;
376 userAgent.Ua =
UdgerParser.ConvertToStr(_row[
"ua"]);
377 userAgent.UaVersion =
UdgerParser.ConvertToStr(_row[
"ua_version"]);
378 userAgent.UaVersionMajor =
UdgerParser.ConvertToStr(_row[
"ua_version_major"]);
381 string pattern =
UdgerParser.ConvertToStr(_row[
"regstring"]);
384 regConv =
new PerlRegExpConverter(pattern,
"", Encoding.UTF8);
385 searchTerm = regConv.Regex;
386 if (searchTerm.IsMatch(
this.ua) && (group = searchTerm.Match(
this.ua).Groups[1]) !=
null)
390 userAgent.UaVersion =
UdgerParser.ConvertToStr(group);
391 userAgent.UaVersionMajor =
UdgerParser.ConvertToStr(group).Split(
'.')[0];
395 clientId =
UdgerParser.ConvertToInt(_row[
"client_id"]);
396 classId =
UdgerParser.ConvertToInt(_row[
"class_id"]);
397 userAgent.CrawlerCategory =
UdgerParser.ConvertToStr(_row[
"crawler_category"]);
398 userAgent.CrawlerCategoryCode =
UdgerParser.ConvertToStr(_row[
"crawler_category_code"]);
399 userAgent.CrawlerLastSeen =
UdgerParser.ConvertToStr(_row[
"crawler_last_seen"]);
400 userAgent.CrawlerRespectRobotstxt =
UdgerParser.ConvertToStr(_row[
"crawler_respect_robotstxt"]);
401 userAgent.UaString = this.ua;
402 userAgent.UaClass =
UdgerParser.ConvertToStr(_row[
"ua_class"]);
403 userAgent.UaClassCode =
UdgerParser.ConvertToStr(_row[
"ua_class_code"]);
404 userAgent.UaUptodateCurrentVersion =
UdgerParser.ConvertToStr(_row[
"ua_uptodate_current_version"]);
405 userAgent.UaFamily =
UdgerParser.ConvertToStr(_row[
"ua_family"]);
406 userAgent.UaFamilyCode =
UdgerParser.ConvertToStr(_row[
"ua_family_code"]);
407 userAgent.UaFamilyHompage =
UdgerParser.ConvertToStr(_row[
"ua_family_homepage"]);
408 userAgent.UaFamilyVendor =
UdgerParser.ConvertToStr(_row[
"ua_family_vendor"]);
409 userAgent.UaFamilyVendorCode =
UdgerParser.ConvertToStr(_row[
"ua_family_vendor_code"]);
410 userAgent.UaFamilyVendorHomepage =
UdgerParser.ConvertToStr(_row[
"ua_family_vendor_homepage"]);
411 userAgent.UaFamilyIcon =
UdgerParser.ConvertToStr(_row[
"ua_family_icon"]);
412 userAgent.UaFamilyIconBig =
UdgerParser.ConvertToStr(_row[
"ua_family_icon_big"]);
413 userAgent.UaFamilyInfoUrl =
UdgerParser.ConvertToStr(_row[
"ua_family_info_url"]);
414 userAgent.UaEngine =
UdgerParser.ConvertToStr(_row[
"ua_engine"]);
417 private void prepareOs(DataRow _row, ref
int _osId)
420 userAgent.Os =
UdgerParser.ConvertToStr(_row[
"os"]);
421 userAgent.OsCode =
UdgerParser.ConvertToStr(_row[
"os_code"]);
422 userAgent.OsHomepage =
UdgerParser.ConvertToStr(_row[
"os_home_page"]);
423 userAgent.OsIcon =
UdgerParser.ConvertToStr(_row[
"os_icon"]);
424 userAgent.OsIconBig =
UdgerParser.ConvertToStr(_row[
"os_icon_big"]);
425 userAgent.OsInfoUrl =
UdgerParser.ConvertToStr(_row[
"os_info_url"]);
426 userAgent.OsFamily =
UdgerParser.ConvertToStr(_row[
"os_family"]);
427 userAgent.OsFamilyCode =
UdgerParser.ConvertToStr(_row[
"os_family_code"]);
428 userAgent.OsFamilyVendor =
UdgerParser.ConvertToStr(_row[
"os_family_vendor"]);
429 userAgent.OsFamilyVendorCode =
UdgerParser.ConvertToStr(_row[
"os_family_vendor_code"]);
430 userAgent.OsFamilyVendorHomepage =
UdgerParser.ConvertToStr(_row[
"os_family_vedor_homepage"]);
434 private void prepareDevice(DataRow _row, ref
int _deviceClassId)
438 userAgent.DeviceClass =
UdgerParser.ConvertToStr(_row[
"device_class"]);
439 userAgent.DeviceClassCode =
UdgerParser.ConvertToStr(_row[
"device_class_code"]);
440 userAgent.DeviceClassIcon =
UdgerParser.ConvertToStr(_row[
"device_class_icon"]);
441 userAgent.DeviceClassIconBig =
UdgerParser.ConvertToStr(_row[
"device_class_icon_big"]);
442 userAgent.DeviceClassInfoUrl =
UdgerParser.ConvertToStr(_row[
"device_class_info_url"]);
446 private void prepareIp(DataRow _row)
448 ipAddress.IpClassification =
UdgerParser.ConvertToStr(_row[
"ip_classification"]);
449 ipAddress.IpClassificationCode =
UdgerParser.ConvertToStr(_row[
"ip_classification_code"]);
450 ipAddress.IpLastSeen =
UdgerParser.ConvertToStr(_row[
"ip_last_seen"]);
451 ipAddress.IpHostname =
UdgerParser.ConvertToStr(_row[
"ip_hostname"]);
452 ipAddress.IpCountry =
UdgerParser.ConvertToStr(_row[
"ip_country"]);
453 ipAddress.IpCountryCode =
UdgerParser.ConvertToStr(_row[
"ip_country_code"]);
454 ipAddress.IpCity =
UdgerParser.ConvertToStr(_row[
"ip_city"]);
455 ipAddress.CrawlerName =
UdgerParser.ConvertToStr(_row[
"name"]);
456 ipAddress.CrawlerVer =
UdgerParser.ConvertToStr(_row[
"ver"]);
457 ipAddress.CrawlerVerMajor =
UdgerParser.ConvertToStr(_row[
"ver_major"]);
458 ipAddress.CrawlerFamily =
UdgerParser.ConvertToStr(_row[
"family"]);
459 ipAddress.CrawlerFamilyCode =
UdgerParser.ConvertToStr(_row[
"family_code"]);
460 ipAddress.CrawlerFamilyHomepage =
UdgerParser.ConvertToStr(_row[
"family_homepage"]);
461 ipAddress.CrawlerFamilyVendor =
UdgerParser.ConvertToStr(_row[
"vendor"]);
462 ipAddress.CrawlerFamilyVendorCode =
UdgerParser.ConvertToStr(_row[
"vendor_code"]);
463 ipAddress.CrawlerFamilyVendorHomepage =
UdgerParser.ConvertToStr(_row[
"vendor_homepage"]);
464 ipAddress.CrawlerFamilyIcon =
UdgerParser.ConvertToStr(_row[
"family_icon"]);
465 ipAddress.CrawlerLastSeen =
UdgerParser.ConvertToStr(_row[
"last_seen"]);
466 ipAddress.CrawlerCategory =
UdgerParser.ConvertToStr(_row[
"crawler_classification"]);
467 ipAddress.CrawlerCategoryCode =
UdgerParser.ConvertToStr(_row[
"crawler_classification_code"]);
468 if (ipAddress.IpClassificationCode ==
"crawler")
469 ipAddress.CrawlerFamilyInfoUrl =
"https://udger.com/resources/ua-list/bot-detail?bot=" +
UdgerParser.ConvertToStr(_row[
"family"]) +
"#id" +
UdgerParser.ConvertToStr(_row[
"botid"]);
470 ipAddress.CrawlerRespectRobotstxt =
UdgerParser.ConvertToStr(_row[
"respect_robotstxt"]);
473 private void prepareIpDataCenter(DataRow _row)
475 ipAddress.DatacenterName =
UdgerParser.ConvertToStr(_row[
"name"]);
476 ipAddress.DatacenterNameCode =
UdgerParser.ConvertToStr(_row[
"name_code"]);
477 ipAddress.DatacenterHomepage =
UdgerParser.ConvertToStr(_row[
"homepage"]);
481 private static string ConvertToStr(
object value)
483 if (value ==
null || value.GetType() == typeof(DBNull))
485 return value.ToString();
488 private static int ConvertToInt(
object value)
490 if (value ==
null || value.GetType() == typeof(DBNull))
492 return Convert.ToInt32(value);
494 private static DateTime ConvertToDateTime(
string value)
497 DateTime.TryParse(value, out dt);
503 private int getIPAddressVersion(
string _ip, out
string _retIp)
505 System.Net.IPAddress addr;
508 if (System.Net.IPAddress.TryParse(_ip, out addr))
510 _retIp = addr.ToString();
511 if (addr.AddressFamily == System.Net.Sockets.AddressFamily.InterNetwork)
513 if (addr.AddressFamily == System.Net.Sockets.AddressFamily.InterNetworkV6)
520 private long AddrToInt(
string addr)
523 return (
long)(uint)System.Net.IPAddress.NetworkToHostOrder(
524 (
int)System.Net.IPAddress.Parse(addr).Address);
527 [MethodImpl(MethodImplOptions.Synchronized)]
528 private static void initStaticStructures(DataReader connection)
530 if (clientRegstringList ==
null) {
532 clientRegstringList = prepareRegexpStruct(connection,
"udger_client_regex");
533 osRegstringList = prepareRegexpStruct(connection,
"udger_os_regex");
534 deviceRegstringList = prepareRegexpStruct(connection,
"udger_deviceclass_regex");
536 clientWordDetector = createWordDetector(connection,
"udger_client_regex",
"udger_client_regex_words");
537 deviceWordDetector = createWordDetector(connection,
"udger_deviceclass_regex",
"udger_deviceclass_regex_words");
538 osWordDetector = createWordDetector(connection,
"udger_os_regex",
"udger_os_regex_words");
542 private static WordDetector createWordDetector(DataReader connection, String regexTableName, String wordTableName)
545 HashSet<int> usedWords =
new HashSet<int>();
547 addUsedWords(usedWords, connection, regexTableName,
"word_id");
548 addUsedWords(usedWords, connection, regexTableName,
"word2_id");
550 WordDetector result =
new WordDetector();
552 DataTable dt = connection.selectQuery(
"SELECT * FROM " + wordTableName);
555 foreach (DataRow row
in dt.Rows)
558 if (usedWords.Contains(
id))
560 String word =
UdgerParser.ConvertToStr(row[
"word"]).ToLower();
561 result.addWord(
id, word);
568 private static void addUsedWords(HashSet<int> usedWords, DataReader connection, String regexTableName, String wordIdColumn)
570 DataTable rs = connection.selectQuery(
"SELECT " + wordIdColumn +
" FROM " + regexTableName);
573 foreach (DataRow row
in rs.Rows)
575 usedWords.Add(
UdgerParser.ConvertToInt(row[wordIdColumn]));
580 private int findIdFromList(String uaString, HashSet<int> foundClientWords, List<IdRegString> list)
582 System.Text.RegularExpressions.Regex searchTerm;
583 PerlRegExpConverter regConv;
585 foreach (IdRegString irs
in list)
587 if ((irs.wordId1 == 0 || foundClientWords.Contains(irs.wordId1)) &&
588 (irs.wordId2 == 0 || foundClientWords.Contains(irs.wordId2)))
590 regConv =
new PerlRegExpConverter(irs.pattern,
"", Encoding.UTF8);
591 searchTerm = regConv.Regex;
592 if (searchTerm.IsMatch(uaString))
602 private static List<IdRegString> prepareRegexpStruct(DataReader connection, String regexpTableName)
604 List<IdRegString> ret =
new List<IdRegString>();
605 DataTable rs = connection.selectQuery(
"SELECT rowid, regstring, word_id, word2_id FROM " + regexpTableName +
" ORDER BY sequence");
608 foreach (DataRow row
in rs.Rows)
610 IdRegString irs =
new IdRegString();
612 irs.wordId1 =
UdgerParser.ConvertToInt(row[
"word_id"]);
613 irs.wordId2 =
UdgerParser.ConvertToInt(row[
"word2_id"]);
614 String regex =
UdgerParser.ConvertToStr(row[
"regstring"]);
616 Regex reg =
new Regex(
@"^/?(.*?)/si$");
617 if (reg.IsMatch(regex))
619 regex = reg.Match(regex).Groups[0].ToString();
void SetDataDir(string dataDir, string fileName)
Set the data directory and DB filename
void SetDataDir(string dataDir)
Set the data directory
void parse()
Parse the user-agent string and/or the IP address
UdgerParser(int LRUCashCapacity=10000)
Constructor
UdgerParser(bool useLRUCash=true, int LRUCashCapacity=10000)
Constructor