diff options
| author | unknown <[email protected]> | 2023-07-29 11:20:27 +0800 |
|---|---|---|
| committer | unknown <[email protected]> | 2023-07-29 11:20:27 +0800 |
| commit | 7592577acc00163e98b45bba86ef76bd37f93854 (patch) | |
| tree | 671fdf3b4715241c84d5b428a4b5dcaa37ac9d6b /code/analysisDataset.py | |
| parent | 5bee0b3e68ef600fea9fe8cbaca0d512a8d21998 (diff) | |
reorganize
Diffstat (limited to 'code/analysisDataset.py')
| -rw-r--r-- | code/analysisDataset.py | 47 |
1 files changed, 47 insertions, 0 deletions
"""Dataset analysis helpers (code/analysisDataset.py).

Tallies SPF authentication verdicts over a folder of e-mail files and
counts how often each entry occurs in an intermediate-domain list.
"""
import os
import re
from collections import Counter

from parseEml import parseEml

# Captures the verdict that follows "spf=". The verdict ends at the first
# whitespace character or ";" so that "spf=pass;" and "spf=pass (comment)"
# both yield "pass" instead of being tallied as different verdicts.
_SPF_VERDICT_RE = re.compile(r"spf=([^\s;]*)")


def read_spf(email_path):
    """Return the SPF verdict found in an e-mail's authentication results.

    Args:
        email_path: Path of the e-mail file, passed to ``parseEml``.

    Returns:
        The token following the first ``spf=`` marker (e.g. ``"pass"``), or
        ``None`` when the authentication results are empty or contain no
        ``spf=`` marker.
    """
    mail = parseEml(email_path)
    auth_result = mail.get_auth_results()
    if not auth_result or "spf=" not in auth_result:
        return None
    # The membership test above guarantees a match; the captured group may
    # be empty when nothing follows the marker.
    return _SPF_VERDICT_RE.search(auth_result).group(1)


def spf_count(email_folder):
    """Tally the SPF verdicts of the e-mail files in a folder.

    Prints the folder path with its total entry count, then the tally.

    Args:
        email_folder: Path of the directory containing the e-mail files.

    Returns:
        A dict mapping each SPF verdict (``None`` when a file has none) to
        the number of files that produced it.
    """
    files = os.listdir(email_folder)
    # NOTE: the printed total counts every directory entry, including the
    # "duplicate" entry that is skipped below.
    print(email_folder + ":\ntotal: " + str(len(files)))
    tally = Counter(
        read_spf(os.path.join(email_folder, name))
        for name in files
        if name != "duplicate"  # presumably a sub-folder of duplicates
    )
    # Convert to a plain dict so the printed form stays "{...}".
    spf_count_dict = dict(tally)
    print(spf_count_dict)
    return spf_count_dict


def inter_domain_count(inter_domain_file):
    """Count how often each entry occurs in an intermediate-domain list.

    Each line of the file is stripped of surrounding whitespace and counted;
    blank lines are ignored. The ranking is printed and returned.

    Args:
        inter_domain_file: Path of a UTF-8 text file with one entry per line.

    Returns:
        A list of ``(entry, count)`` tuples ordered by descending count;
        entries with equal counts keep their first-seen order.
    """
    with open(inter_domain_file, "r", encoding="utf-8") as f:
        stripped_lines = (line.strip() for line in f)
        counts = Counter(entry for entry in stripped_lines if entry)
    ranked = counts.most_common()
    print(ranked)
    return ranked


if __name__ == "__main__":
    inter_domain_count("inter_domain.txt")
\ No newline at end of file |
