summaryrefslogtreecommitdiff
path: root/DomainDeal.cpp
blob: 28581f5ac2c12a22af55e48f79ee1e3b1740d8d0 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
// DomainDeal.cpp : This file contains the "main" function. Program execution begins and ends there.
//

#include "Common.h"
#include "Data.h"


// Global assertion helper, constructed with `cout` and AssertOption::thrw_log.
// main() invokes it as g_asst(condition, message) to guard the metadata loads.
// NOTE(review): presumably logs the message to cout and throws when the
// condition is false (going by the option name) — confirm against the
// AssertOperator definition.
AssertOperator<> g_asst(cout, AssertOption::thrw_log);

int main(int argc, char *argv[])
{
	
	//判断参数
	if(argc<2) {
		cout <<"input filename as arg" <<endl;
		return -1;
	}

	//读取文件
	ifstream ifs(argv[1]);
	if(!ifs.is_open()) {
		cout <<"cant open file" <<endl;
		return -1;
	}

	cout <<"read metadata start\n";
	//读取元数据
	AcMachine<char, string> mtCdn;
	TrieTree<char, string> mtUrl;
	BinReadFile brf(true, true);
	g_asst(brf.Open(CDN_FILE)
		&& (brf >>mtCdn)
		&& brf.Close(true),
		"cant read cdn data\n");
	g_asst(brf.Open(URL_FILE)
		&& (brf >>mtUrl)
		&& brf.Close(true),
		"cant read url data\n");
	cout <<"read metadata success\n";

	//组织输出结构
	std::map<string, Partition> mapPar;
	ofstream ofs(STATIS_FILE);

	//排除行
	cout <<"deal dns start\n";
	string strHead;
	long long cntLine= 0, cntForm= 0, cntValid= 0;
	constexpr long long limLine = -1;
	constexpr bool bOutHead = false;
	for(int i=0; i!=1; ++i) {
		string str;
		std::getline(ifs, str);
		if(bOutHead)
			strHead <<str <<"\n";
		++ cntLine;
	}
	//按行读取
	for(string strLine; std::getline(ifs, strLine)
		&& (limLine<=0 || cntLine<limLine); ++cntLine)
	{
		//间隔输出中间结果
		if(cntLine%OUTPUT_INTERVAL==0) {
			Statistic(ofs, mapPar, cntValid);
			cout <<"\r" <<cntLine <<" lines" <<flush;
		}
		//解析字段
		istringstream iss(strLine);
		vector<string> vec;
		for(int i=0; i!=e_end; ++i) {
			vec.emplace_back();
			if(!std::getline(iss, vec.back(), '\t')) {
				break;
			}
		}
		if(vec.size()!=e_end)
			continue;
		++ cntForm;
		//处理域名
		if(!FormDomain(vec[e_domain]))
			continue;
		++ cntValid;
		//查找CDN
		string &strOut = vec[e_domain];
		string *res;
		if((res= mtUrl.Judge(vec[e_domain].rbegin(), vec[e_domain].rend())))
			AddMap(mapPar, strHead, *res, strOut);
		else if((res= FastFindCdn(mtCdn, vec[e_domain])))
			AddMap(mapPar, strHead, SPCDN_NAME, strOut);
		else
			AddMap(mapPar, strHead, OTHER_NAME, strOut);
	}
	cout <<"\ndeal dns success\n";
	cout <<"cntLine: " <<cntLine <<"\n"
		<<"cntForm: " <<cntForm <<"\n"
		<<"cntValid: " <<cntValid <<"\n";

	//最终统计
	Statistic(ofs, mapPar, cntValid);
}