summaryrefslogtreecommitdiff
path: root/code/analysisDataset.py
diff options
context:
space:
mode:
authorunknown <[email protected]>2023-07-29 11:20:27 +0800
committerunknown <[email protected]>2023-07-29 11:20:27 +0800
commit7592577acc00163e98b45bba86ef76bd37f93854 (patch)
tree671fdf3b4715241c84d5b428a4b5dcaa37ac9d6b /code/analysisDataset.py
parent5bee0b3e68ef600fea9fe8cbaca0d512a8d21998 (diff)
reorganize
Diffstat (limited to 'code/analysisDataset.py')
-rw-r--r--code/analysisDataset.py47
1 files changed, 47 insertions, 0 deletions
diff --git a/code/analysisDataset.py b/code/analysisDataset.py
new file mode 100644
index 0000000..34c7a06
--- /dev/null
+++ b/code/analysisDataset.py
@@ -0,0 +1,47 @@
+from parseEml import parseEml
+import os
+
def read_spf(email_path):
    """Return the SPF verdict recorded in an email's authentication results.

    The message at ``email_path`` is loaded with ``parseEml`` and the text
    returned by its ``get_auth_results()`` is searched for the first
    ``spf=`` marker.

    Args:
        email_path: Path of the email file handed to ``parseEml``.

    Returns:
        The token that directly follows the first ``spf=`` (for example
        ``"pass"``), or ``None`` when the authentication results are empty
        or contain no ``spf=`` marker.
    """
    mail = parseEml(email_path)
    auth_result = mail.get_auth_results()
    if not auth_result or "spf=" not in auth_result:
        return None

    remainder = auth_result.split("spf=", 1)[1]
    # The verdict ends at the first whitespace of any kind (the header text
    # may still be folded, i.e. continue with a newline or tab rather than a
    # space) or at the ";" that separates result clauses. Cutting only at a
    # literal space would make "pass", "pass;" and "pass\n" count as three
    # different verdicts.
    for index, char in enumerate(remainder):
        if char.isspace() or char == ";":
            return remainder[:index]
    return remainder
+
def spf_count(email_folder):
    """Tally the SPF verdicts of the emails stored in a folder.

    Prints the folder path with its total number of entries, then prints
    the finished tally.

    Args:
        email_folder: Path of the folder whose entries are read as emails.

    Returns:
        Dict mapping each value returned by ``read_spf`` (``None`` when an
        email has no SPF result) to the number of emails that produced it.
    """
    spf_count_dict = {}
    entries = os.listdir(email_folder)
    # The reported total is the raw directory entry count, so it still
    # includes the "duplicate" entry that is skipped below.
    print(email_folder+":\ntotal: "+str(len(entries)))
    for entry in entries:
        # The entry named "duplicate" is not analysed as an email.
        if entry == "duplicate":
            continue
        spf_result = read_spf(os.path.join(email_folder, entry))
        spf_count_dict[spf_result] = spf_count_dict.get(spf_result, 0) + 1
    print(spf_count_dict)
    return spf_count_dict
+
def inter_domain_count(inter_domain_file):
    """Count how often each intermediate domain appears in a list file.

    The file is read as UTF-8 with one domain per line. Surrounding
    whitespace is stripped from every line, and lines that are empty after
    stripping are ignored so they are not tallied as an empty-string domain.
    The result is printed and also returned.

    Args:
        inter_domain_file: Path of the text file listing the domains.

    Returns:
        List of ``(domain, count)`` tuples ordered by descending count;
        domains with equal counts keep their first-seen order.
    """
    count_dict = {}
    with open(inter_domain_file, "r", encoding="utf-8") as f:
        for raw_line in f:
            domain = raw_line.strip()
            if not domain:
                continue
            count_dict[domain] = count_dict.get(domain, 0) + 1
    # sorted() is stable, so ties stay in the order the domains were first met.
    ranking = sorted(count_dict.items(), key=lambda item: item[1], reverse=True)
    print(ranking)
    return ranking
+
if __name__ == "__main__":
    # When run as a script, tally the domains listed in "inter_domain.txt"
    # (a relative path, so it is resolved against the current directory).
    inter_domain_count("inter_domain.txt")