diff options
| author | 张硕 <[email protected]> | 2019-12-31 11:10:43 +0800 |
|---|---|---|
| committer | 张硕 <[email protected]> | 2019-12-31 11:10:43 +0800 |
| commit | 6fe2e67322cbbbc849c68b930d1b1c1a00b27aec (patch) | |
| tree | fbce9eaa0f01091b19430145ac4de89afcaaf4a7 | |
| parent | d5d15da57ed85936cbfe2525d03d7bd195709b18 (diff) | |
Update Data.h, DomainDeal, DomainDeal.cpp files
| -rw-r--r-- | Data.h | 110 | ||||
| -rw-r--r-- | DomainDeal | bin | 0 -> 824840 bytes | |||
| -rw-r--r-- | DomainDeal.cpp | 101 |
3 files changed, 211 insertions, 0 deletions
@@ -0,0 +1,110 @@ +#pragma once + + +#define LIB_PATH "lib/" +#define DATA_PATH "data/" +#define CDN_FILE LIB_PATH "CdnDomainList.dat" +#define URL_FILE LIB_PATH "UrlDomainList.dat" + +#define SPCDN_NAME "spcdn" +#define OTHER_NAME "other" +#define STATIS_NAME "statis" +#define ALL_NAME "all" + +#define STATIS_FILE DATA_PATH STATIS_NAME ".txt" + +#define OUTPUT_INTERVAL 500000 + + +//×Ö¶Îö¾Ù +enum FileForm +{ + e_sip, e_dip, e_domain, e_qtype, e_qcnt, + e_ratio, e_dir, e_auth, e_rval, e_rtype, + e_rcode, e_ttl, e_time, e_qlen, e_rlen, + e_rother, + e_end, +}; + + +//»®·Ö½á¹¹ +struct Partition +{ + ofstream ofs; + long long cnt = 0; + void Init(const string &name, const string &strHead) + { + string formName = name; + auto lbdForm = [](char ch) { + if(IsIdChar(ch)) + return ch; + else + return '_'; + }; + std::transform(SHOW_BEGIN_END(formName), formName.begin(), lbdForm); + ofs.open(name); + ofs <<strHead; + } +}; + + +//¹æ·¶»¯ÓòÃû +inline bool IsDomainChar(char ch) +{ + return (ch>='0' && ch<='9') + || (ch>='a' && ch<='z') + || ch=='-' || ch=='_' || ch=='.'; +} +inline bool FormDomain(string &domain) +{ + //´óдת»»Ð¡Ð´ + std::transform(domain.begin(), domain.end(), domain.begin(), UppCharToLowChar); + //²âÊÔÊÇ·ñÓÐÆæ¹Ö×Ö·û + if(std::find_if_not(domain.begin(), domain.end(), IsDomainChar)!=domain.end()) + return false; + return true; +} + + +//²éÕÒCDNº¯Êý +inline string *FastFindCdn(AcMachine<char, string> &mtCdn, string &domain) +{ + //´¦Àíºó׺µã + bool bBackDot = domain.back()=='.'; + if(bBackDot) + domain.pop_back(); + //Ôö¼Óкó׺ + domain.push_back('#'); + //Æ¥Åä + auto ret = mtCdn.Judge(SHOW_BEGIN_END(domain)); + domain.pop_back(); + if(bBackDot) + domain.push_back('.'); + return ret; +} + + +//Ìí¼ÓÐÅÏ¢º¯Êý +inline void AddMap(std::map<string, Partition> &mapPar, const string &strHead, + const string &key, const string &strLine) +{ + auto res = mapPar.emplace(piecewise_construct, std::tie(key), make_tuple()); + if(res.second) + res.first->second.Init(DATA_PATH+res.first->first, strHead); + res.first->second.ofs <<strLine <<"\n"; + ++ res.first->second.cnt; +} + + +//ͳ¼Æº¯Êý +inline void Statistic(ofstream &ofs, std::map<string, Partition> &mapPar, long long cntValid) +{ + ofs.seekp(0); + ofs <<std::fixed; + ofs <<ALL_NAME <<" " <<cntValid <<" " <<"100%\n"; + for(auto &pr: mapPar) { + ofs <<pr.first <<" " <<pr.second.cnt <<" " <<(double)pr.second.cnt*100/cntValid <<"%\n"; + pr.second.ofs <<flush; + } + ofs <<"end\n\n\n" <<flush; +} diff --git a/DomainDeal b/DomainDeal Binary files differnew file mode 100644 index 0000000..a4570db --- /dev/null +++ b/DomainDeal diff --git a/DomainDeal.cpp b/DomainDeal.cpp new file mode 100644 index 0000000..28581f5 --- /dev/null +++ b/DomainDeal.cpp @@ -0,0 +1,101 @@ +// DomainDeal.cpp : æ¤æä»¶å
å« "main" 彿°ãç¨åºæ§è¡å°å¨æ¤å¤å¼å§å¹¶ç»æã +// + +#include "Common.h" +#include "Data.h" + + +AssertOperator<> g_asst(cout, AssertOption::thrw_log); + +int main(int argc, char *argv[]) +{ + + //夿忰 + if(argc<2) { + cout <<"input filename as arg" <<endl; + return -1; + } + + //读åæä»¶ + ifstream ifs(argv[1]); + if(!ifs.is_open()) { + cout <<"cant open file" <<endl; + return -1; + } + + cout <<"read metadata start\n"; + //读åå
æ°æ® + AcMachine<char, string> mtCdn; + TrieTree<char, string> mtUrl; + BinReadFile brf(true, true); + g_asst(brf.Open(CDN_FILE) + && (brf >>mtCdn) + && brf.Close(true), + "cant read cdn data\n"); + g_asst(brf.Open(URL_FILE) + && (brf >>mtUrl) + && brf.Close(true), + "cant read url data\n"); + cout <<"read metadata success\n"; + + //ç»ç»è¾åºç»æ + std::map<string, Partition> mapPar; + ofstream ofs(STATIS_FILE); + + //æé¤è¡ + cout <<"deal dns start\n"; + string strHead; + long long cntLine= 0, cntForm= 0, cntValid= 0; + constexpr long long limLine = -1; + constexpr bool bOutHead = false; + for(int i=0; i!=1; ++i) { + string str; + std::getline(ifs, str); + if(bOutHead) + strHead <<str <<"\n"; + ++ cntLine; + } + //æè¡è¯»å + for(string strLine; std::getline(ifs, strLine) + && (limLine<=0 || cntLine<limLine); ++cntLine) + { + //é´éè¾åºä¸é´ç»æ + if(cntLine%OUTPUT_INTERVAL==0) { + Statistic(ofs, mapPar, cntValid); + cout <<"\r" <<cntLine <<" lines" <<flush; + } + //è§£æå段 + istringstream iss(strLine); + vector<string> vec; + for(int i=0; i!=e_end; ++i) { + vec.emplace_back(); + if(!std::getline(iss, vec.back(), '\t')) { + break; + } + } + if(vec.size()!=e_end) + continue; + ++ cntForm; + //å¤çåå + if(!FormDomain(vec[e_domain])) + continue; + ++ cntValid; + //æ¥æ¾CDN + string &strOut = vec[e_domain]; + string *res; + if((res= mtUrl.Judge(vec[e_domain].rbegin(), vec[e_domain].rend()))) + AddMap(mapPar, strHead, *res, strOut); + else if((res= FastFindCdn(mtCdn, vec[e_domain]))) + AddMap(mapPar, strHead, SPCDN_NAME, strOut); + else + AddMap(mapPar, strHead, OTHER_NAME, strOut); + } + cout <<"\ndeal dns success\n"; + cout <<"cntLine: " <<cntLine <<"\n" + <<"cntForm: " <<cntForm <<"\n" + <<"cntValid: " <<cntValid <<"\n"; + + //æç»ç»è®¡ + Statistic(ofs, mapPar, cntValid); +} + |
