summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author张硕 <[email protected]>2019-12-31 11:10:43 +0800
committer张硕 <[email protected]>2019-12-31 11:10:43 +0800
commit6fe2e67322cbbbc849c68b930d1b1c1a00b27aec (patch)
treefbce9eaa0f01091b19430145ac4de89afcaaf4a7
parentd5d15da57ed85936cbfe2525d03d7bd195709b18 (diff)
Update Data.h, DomainDeal, DomainDeal.cpp files
-rw-r--r--Data.h110
-rw-r--r--DomainDealbin0 -> 824840 bytes
-rw-r--r--DomainDeal.cpp101
3 files changed, 211 insertions, 0 deletions
diff --git a/Data.h b/Data.h
new file mode 100644
index 0000000..b1e690f
--- /dev/null
+++ b/Data.h
@@ -0,0 +1,110 @@
+#pragma once
+// Configuration constants shared by Data.h and DomainDeal.cpp.
+// The path macros below rely on adjacent string-literal concatenation, so they must remain plain string literals.
+#define LIB_PATH "lib/"
+#define DATA_PATH "data/"
+#define CDN_FILE LIB_PATH "CdnDomainList.dat"
+#define URL_FILE LIB_PATH "UrlDomainList.dat"
+// Labels: SPCDN_NAME and OTHER_NAME are partition keys used in main(), ALL_NAME heads the report written by Statistic(), STATIS_NAME names the report file.
+#define SPCDN_NAME "spcdn"
+#define OTHER_NAME "other"
+#define STATIS_NAME "statis"
+#define ALL_NAME "all"
+// Expands to "data/statis.txt".
+#define STATIS_FILE DATA_PATH STATIS_NAME ".txt"
+// main() rewrites the statistics report whenever its line counter is a multiple of this value.
+#define OUTPUT_INTERVAL 500000
+
+
+// Field enumeration: each enumerator is the zero-based column index of one tab-separated field of an input record (see the parsing loop in main()).
+enum FileForm
+{
+ e_sip, e_dip, e_domain, e_qtype, e_qcnt, // indices 0-4; e_domain is the field main() normalizes and classifies
+ e_ratio, e_dir, e_auth, e_rval, e_rtype, // indices 5-9
+ e_rcode, e_ttl, e_time, e_qlen, e_rlen, // indices 10-14
+ e_rother, // index 15
+ e_end, // 16: not a field; the number of fields main() requires per record
+};
+
+
+// Partition structure: one output file plus the number of records written to it.
+struct Partition
+{
+    ofstream ofs;       // output stream receiving this partition's records
+    long long cnt = 0;  // number of records written; incremented by AddMap()
+
+    // Opens the partition's output file and writes the header text to it.
+    //   name    - path of the file to open; used verbatim
+    //   strHead - text written first; may be empty
+    // No error is reported if the open fails; the stream is simply left in a
+    // failed state and later writes to it have no effect.
+    void Init(const string &name, const string &strHead)
+    {
+        // NOTE: `name` is deliberately not sanitized. AddMap() passes a path
+        // that contains '/', so replacing non-identifier characters would
+        // break it. (An identifier-sanitized copy used to be computed here
+        // but was never used; that dead work has been dropped.)
+        ofs.open(name);
+        ofs <<strHead;
+    }
+};
+
+
+// Domain normalization helpers.
+//
+// IsDomainChar: true when `ch` is a decimal digit, a lowercase ASCII letter,
+// or one of '-', '_', '.'. Uppercase letters are rejected here; FormDomain
+// converts the string before testing it.
+inline bool IsDomainChar(char ch)
+{
+    if(ch>='0' && ch<='9')
+        return true;
+    if(ch>='a' && ch<='z')
+        return true;
+    switch(ch) {
+    case '-':
+    case '_':
+    case '.':
+        return true;
+    default:
+        return false;
+    }
+}
+
+// FormDomain: normalizes `domain` in place, then validates it.
+//   domain - rewritten character by character through UppCharToLowChar
+//            (upper case to lower case); it stays rewritten even when the
+//            function returns false
+// Returns true only if every resulting character satisfies IsDomainChar.
+// An empty string therefore returns true.
+inline bool FormDomain(string &domain)
+{
+    // Convert upper case to lower case.
+    std::transform(domain.begin(), domain.end(), domain.begin(), UppCharToLowChar);
+    // Reject the domain if any unexpected character remains.
+    return std::all_of(domain.begin(), domain.end(), IsDomainChar);
+}
+
+
+// CDN lookup: matches `domain` against the CDN automaton.
+//
+// The string is edited temporarily for the match: one trailing '.' (if any)
+// is removed and '#' is appended. Both edits are undone before returning, so
+// the caller sees `domain` unchanged.
+//   mtCdn  - automaton queried through Judge(begin, end)
+//   domain - domain to look up; modified during the call and then restored
+// Returns the result of mtCdn.Judge; main() treats a null result as "no match".
+inline string *FastFindCdn(AcMachine<char, string> &mtCdn, string &domain)
+{
+    // Handle a trailing dot. back() on an empty string is undefined
+    // behaviour, and FormDomain() accepts an empty domain, so check for
+    // emptiness first.
+    const bool bBackDot = !domain.empty() && domain.back()=='.';
+    if(bBackDot)
+        domain.pop_back();
+    // Add the '#' suffix used for matching (presumably the stored patterns
+    // end with '#' -- confirm against the CDN data file).
+    domain.push_back('#');
+    // Match.
+    auto ret = mtCdn.Judge(SHOW_BEGIN_END(domain));
+    // Undo the temporary edits in reverse order.
+    domain.pop_back();
+    if(bBackDot)
+        domain.push_back('.');
+    return ret;
+}
+
+
+// Add-record helper: appends one line to the partition named `key`, creating
+// that partition on first use.
+//   mapPar  - partitions indexed by name; a missing entry is constructed in place
+//   strHead - header text handed to Partition::Init for a newly created partition
+//   key     - partition name; the output file path is DATA_PATH followed by it
+//   strLine - text written to the partition's file, followed by "\n"
+inline void AddMap(std::map<string, Partition> &mapPar, const string &strHead,
+    const string &key, const string &strLine)
+{
+    // Partition owns an ofstream and cannot be copied, so it is built
+    // directly inside the map node.
+    auto inserted = mapPar.emplace(piecewise_construct, std::tie(key), make_tuple());
+    auto &entry = *inserted.first;
+    Partition &par = entry.second;
+    const bool bCreated = inserted.second;
+    if(bCreated)
+        par.Init(DATA_PATH+entry.first, strHead);
+    par.ofs <<strLine <<"\n";
+    ++ par.cnt;
+}
+
+
+// Statistics report: rewrites the report from the start of `ofs` and flushes
+// every partition stream.
+//   ofs      - report stream; repositioned to offset 0 and overwritten in
+//              place (the file is not truncated)
+//   mapPar   - partitions to report; each one's stream is flushed
+//   cntValid - total used as the 100% base for the percentages
+// Output: an ALL_NAME line, one "name count percent%" line per partition,
+// then an "end" line followed by two blank lines.
+inline void Statistic(ofstream &ofs, std::map<string, Partition> &mapPar, long long cntValid)
+{
+    ofs.seekp(0);
+    ofs <<std::fixed;
+    ofs <<ALL_NAME <<" " <<cntValid <<" " <<"100%\n";
+    for(auto it=mapPar.begin(); it!=mapPar.end(); ++it) {
+        Partition &par = it->second;
+        const double percent = (double)par.cnt*100/cntValid;
+        ofs <<it->first <<" " <<par.cnt <<" " <<percent <<"%\n";
+        par.ofs <<flush;
+    }
+    ofs <<"end\n\n\n" <<flush;
+}
diff --git a/DomainDeal b/DomainDeal
new file mode 100644
index 0000000..a4570db
--- /dev/null
+++ b/DomainDeal
Binary files differ
diff --git a/DomainDeal.cpp b/DomainDeal.cpp
new file mode 100644
index 0000000..28581f5
--- /dev/null
+++ b/DomainDeal.cpp
@@ -0,0 +1,101 @@
+// DomainDeal.cpp : This file contains the "main" function. Program execution begins and ends there.
+//
+
+#include "Common.h"
+#include "Data.h"
+
+
+AssertOperator<> g_asst(cout, AssertOption::thrw_log); // Global assertion helper constructed with cout; main() calls it as g_asst(condition, message). NOTE(review): thrw_log presumably means "log and throw on failure" - confirm in Common.h.
+
+int main(int argc, char *argv[])
+{
+ // Entry point: reads the tab-separated input file named by argv[1], classifies each record by its domain field, appends the domain to a per-category file under DATA_PATH, and keeps a running report in STATIS_FILE. Returns -1 when the argument is missing or the input cannot be opened.
+ // Check arguments: the input file name is required.
+ if(argc<2) {
+ cout <<"input filename as arg" <<endl;
+ return -1;
+ }
+
+ // Open the input file.
+ ifstream ifs(argv[1]);
+ if(!ifs.is_open()) {
+ cout <<"cant open file" <<endl;
+ return -1;
+ }
+
+ cout <<"read metadata start\n";
+ // Load metadata: the CDN automaton from CDN_FILE and the URL trie from URL_FILE. g_asst receives the combined result of open, read and close together with the failure message.
+ AcMachine<char, string> mtCdn;
+ TrieTree<char, string> mtUrl;
+ BinReadFile brf(true, true);
+ g_asst(brf.Open(CDN_FILE)
+ && (brf >>mtCdn)
+ && brf.Close(true),
+ "cant read cdn data\n");
+ g_asst(brf.Open(URL_FILE)
+ && (brf >>mtUrl)
+ && brf.Close(true),
+ "cant read url data\n");
+ cout <<"read metadata success\n";
+
+ // Set up output: one Partition per category name, plus the statistics report file.
+ std::map<string, Partition> mapPar;
+ ofstream ofs(STATIS_FILE); // NOTE(review): a failed open is not checked here
+
+ // Skip header lines.
+ cout <<"deal dns start\n";
+ string strHead;
+ long long cntLine= 0, cntForm= 0, cntValid= 0; // lines read / records with e_end fields / records whose domain passed FormDomain
+ constexpr long long limLine = -1; // maximum line count; a value <= 0 disables the limit (see the loop condition below)
+ constexpr bool bOutHead = false; // when true, the skipped header lines are collected into strHead
+ for(int i=0; i!=1; ++i) { // exactly one header line is skipped
+ string str;
+ std::getline(ifs, str); // result is not checked; cntLine is incremented even if nothing was read
+ if(bOutHead)
+ strHead <<str <<"\n"; // NOTE(review): relies on an operator<< for string that is not defined in this file (presumably in Common.h)
+ ++ cntLine;
+ }
+ // Read the remaining lines one at a time; cntLine counts every line read, header included.
+ for(string strLine; std::getline(ifs, strLine)
+ && (limLine<=0 || cntLine<limLine); ++cntLine)
+ {
+ // Periodically rewrite the intermediate statistics and show progress on the console.
+ if(cntLine%OUTPUT_INTERVAL==0) {
+ Statistic(ofs, mapPar, cntValid);
+ cout <<"\r" <<cntLine <<" lines" <<flush;
+ }
+ // Parse fields: split the line on tabs into at most e_end strings.
+ istringstream iss(strLine);
+ vector<string> vec;
+ for(int i=0; i!=e_end; ++i) {
+ vec.emplace_back(); // the slot is added before the read, so it remains even when the read below fails
+ if(!std::getline(iss, vec.back(), '\t')) {
+ break;
+ }
+ }
+ if(vec.size()!=e_end) // NOTE(review): because the slot is added first, a line with e_end-1 fields (or with an empty last field) also reaches size e_end and is accepted
+ continue;
+ ++ cntForm;
+ // Process the domain: normalize it in place and skip the record if FormDomain rejects it.
+ if(!FormDomain(vec[e_domain]))
+ continue;
+ ++ cntValid;
+ // Classify: the URL trie is tried first (queried with the domain reversed), then the CDN automaton; anything else goes to OTHER_NAME.
+ string &strOut = vec[e_domain]; // only the normalized domain is written to the partition file
+ string *res;
+ if((res= mtUrl.Judge(vec[e_domain].rbegin(), vec[e_domain].rend())))
+ AddMap(mapPar, strHead, *res, strOut);
+ else if((res= FastFindCdn(mtCdn, vec[e_domain])))
+ AddMap(mapPar, strHead, SPCDN_NAME, strOut);
+ else
+ AddMap(mapPar, strHead, OTHER_NAME, strOut);
+ }
+ cout <<"\ndeal dns success\n";
+ cout <<"cntLine: " <<cntLine <<"\n"
+ <<"cntForm: " <<cntForm <<"\n"
+ <<"cntValid: " <<cntValid <<"\n";
+
+ // Final statistics.
+ Statistic(ofs, mapPar, cntValid);
+}
+