summaryrefslogtreecommitdiff
path: root/code/hunterGraph.py
diff options
context:
space:
mode:
authorunknown <[email protected]>2023-07-29 11:20:27 +0800
committerunknown <[email protected]>2023-07-29 11:20:27 +0800
commit7592577acc00163e98b45bba86ef76bd37f93854 (patch)
tree671fdf3b4715241c84d5b428a4b5dcaa37ac9d6b /code/hunterGraph.py
parent5bee0b3e68ef600fea9fe8cbaca0d512a8d21998 (diff)
reorganize
Diffstat (limited to 'code/hunterGraph.py')
-rw-r--r--code/hunterGraph.py286
1 files changed, 286 insertions, 0 deletions
diff --git a/code/hunterGraph.py b/code/hunterGraph.py
new file mode 100644
index 0000000..6332552
--- /dev/null
+++ b/code/hunterGraph.py
@@ -0,0 +1,286 @@
+import networkx as nx
+import csv
+
def show_connected_subgraphs(edge_file):
    """Scatter-plot (#nodes, #edges) for every connected component of the edge list.

    edge_file: CSV with 'node1' and 'node2' columns, one undirected edge per row.
    Components are processed largest-first.  Blocks on plt.show().
    """
    G = nx.Graph()
    with open(edge_file, 'r', encoding="utf-8") as edgefile:
        edge_list = [(edge['node1'], edge['node2'])
                     for edge in csv.DictReader(edgefile)]
    G.add_edges_from(edge_list)

    # Single component-enumeration pass.  (The original also computed the
    # largest component separately and never used it — dead code removed.)
    node_num_list = []
    edge_num_list = []
    for component in sorted(nx.connected_components(G), key=len, reverse=True):
        subgraph = G.subgraph(component)
        node_num_list.append(subgraph.number_of_nodes())
        edge_num_list.append(subgraph.number_of_edges())

    # Plotting deps imported lazily so the counting helpers above don't
    # require matplotlib to be installed.
    import matplotlib.pyplot as plt
    import numpy as np

    plt.xlabel("nodes")
    plt.ylabel("edges")
    plt.scatter(np.array(node_num_list), np.array(edge_num_list))
    plt.show()
+
def node_type_count(node_file):
    """Print a histogram of the 'type' column of node_file as a plain dict.

    node_file: CSV containing at least a 'type' column.
    """
    from collections import Counter
    with open(node_file, 'r', encoding="utf-8") as nodefile:
        counts = Counter(node["type"] for node in csv.DictReader(nodefile))
    # Convert to a plain dict so the printed output format matches the
    # original hand-rolled counter (insertion order == first-seen order).
    print(dict(counts))
+
+import pandas as pd
def benign_fraud_count(node_file, subgraph_node_file):
    """Count node categories inside a saved subgraph node list and print totals.

    node_file: CSV with 'index', 'name' and 'type' columns.
    subgraph_node_file: first line is a printed Python list of node indexes,
        e.g. "['12', '431']" (str(list) formatting).

    type 0 = email node whose name is 'email' + number; numbers in
    6264..6550 are counted as fraud — dataset-specific range, TODO confirm
    against the corpus.  type 1 = domain, 2 = IP, anything else = x-mailer.
    """
    nodes = pd.read_csv(node_file, encoding='utf-8')
    # index -> (name, type) lookup built once; the original ran an O(n)
    # DataFrame filter for every subgraph node.
    node_info = {int(row['index']): (row['name'], row['type'])
                 for _, row in nodes.iterrows()}

    fraud_count = benign_count = 0
    domain_count = IP_count = x_mailer_count = 0
    with open(subgraph_node_file, 'r', encoding="utf-8") as f:
        line = f.readline().strip()
    # Undo the str(list) formatting: "['a', 'b']" -> ['a', 'b'].
    sub_nodes = line.replace("['", "").replace("']", "").split("', '")
    for sub_node in sub_nodes:
        node_name, node_type = node_info[int(sub_node)]
        if node_type == 0:
            # Strip the 5-char 'email' prefix to get the message number.
            node_num = int(node_name[5:])
            if 6264 <= node_num <= 6550:
                fraud_count += 1
            else:
                benign_count += 1
        elif node_type == 1:
            domain_count += 1
        elif node_type == 2:
            IP_count += 1
        else:
            x_mailer_count += 1

    print("fraud: " + str(fraud_count))
    print("benign: " + str(benign_count))
    print("domain: " + str(domain_count))
    print("IP: " + str(IP_count))
    print("x-mailer: " + str(x_mailer_count))
+
def merge_meta_path(edge_file, meta_path_file):
    """Enumerate length-2 meta paths (node1, middle, node2) and write them as CSV.

    Two distinct edges that share exactly one node form a meta path whose
    middle element is the shared node.  Each path is emitted in both
    orientations, matching the original all-ordered-pairs scan.

    edge_file: CSV with 'node1','node2' columns.
    meta_path_file: output CSV with header 'node1,path,node2'.
    """
    G = nx.Graph()
    with open(edge_file, 'r', encoding="utf-8") as edgefile:
        edge_list = [(edge['node1'], edge['node2'])
                     for edge in csv.DictReader(edgefile)]
    G.add_edges_from(edge_list)

    edges = list(G.edges)
    meta_path_list = []
    for i, (a, b) in enumerate(edges):
        for j, (c, d) in enumerate(edges):
            if i == j:
                # BUGFIX: the old code compared an edge against itself, and
                # since b == d always holds there, it emitted a degenerate
                # path (u, v, u) for every edge.  Skip self-pairs.
                continue
            if b == c:
                meta_path_list.append((a, b, d))
            elif b == d:
                meta_path_list.append((a, b, c))
            elif a == c:
                # BUGFIX: edge pairs sharing their *first* endpoint were
                # never detected in either iteration order and their meta
                # paths were silently dropped.
                meta_path_list.append((b, a, d))
            elif a == d:
                meta_path_list.append((b, a, c))
    print(meta_path_list)
    with open(meta_path_file, 'w', encoding="utf-8") as f:
        f.write("node1,path,node2\n")
        for n1, mid, n2 in meta_path_list:
            f.write(n1 + "," + mid + "," + n2 + "\n")
+
def new_index_to_subgraph(edge_file, subgraph_index_file, node_file):
    """Assign new 1-based indexes to every email node reachable via edge_file.

    Writes 'oldIndex,newIndex,label' rows to subgraph_index_file; label is 1
    when the email number (name minus the 5-char 'email' prefix) is < 2010,
    else 0 — dataset-specific cutoff, TODO confirm.

    Only type-0 (email) nodes are re-indexed; other node types are skipped.
    """
    nodes = pd.read_csv(node_file, encoding='utf-8')
    G = nx.Graph()
    with open(edge_file, 'r', encoding="utf-8") as edgefile:
        edge_list = [(edge['node1'], edge['node2'])
                     for edge in csv.DictReader(edgefile)]
    G.add_edges_from(edge_list)

    with open(subgraph_index_file, 'w', encoding='utf-8') as new_index_file:
        new_index_file.write("oldIndex,newIndex,label\n")
        index = 1
        new_node_dict = {}
        for edge in G.edges:
            # BUGFIX: the original used `continue` when edge[0] was already
            # registered, which skipped edge[1] entirely; both endpoints are
            # now always examined.
            for old_index in edge:
                node_row = nodes[nodes['index'] == int(old_index)].index.tolist()[0]
                node_name = nodes.at[node_row, 'name']
                node_type = nodes.at[node_row, 'type']
                if node_type != 0:
                    continue  # only email nodes get a new index
                node_num = node_name[5:]
                if node_num in new_node_dict:
                    continue  # already assigned
                new_node_dict[node_num] = str(index)
                label = "1" if int(node_num) < 2010 else "0"
                new_index_file.write(old_index + "," + str(index) + "," + label + "\n")
                index += 1
+
+
def split_meta_path(node_file, meta_path_file, index_file):
    """Split meta paths by middle-node type into EDE/EIE/EXE CSV files.

    node_file: CSV with 'index','name','type' columns (type of the middle node
        decides the bucket: 1 -> EDE, 2 -> EIE, 3 -> EXE; other types ignored).
    meta_path_file: CSV with 'node1','path','node2' (old node indexes).
    index_file: CSV with 'oldIndex','newIndex' used to remap endpoints.

    Writes EDE_list.csv / EIE_list.csv / EXE_list.csv (header 'eml1,eml2')
    into the current working directory.
    """
    nodes = pd.read_csv(node_file, encoding='utf-8')
    indexes = pd.read_csv(index_file, encoding='utf-8')

    # middle-node type -> list of (node1, node2) endpoint pairs
    buckets = {1: [], 2: [], 3: []}
    with open(meta_path_file, 'r', encoding="utf-8") as f:
        for path in csv.DictReader(f):
            node_index = nodes[nodes['index'] == int(path['path'])].index.tolist()[0]
            node_type = nodes.at[node_index, 'type']
            if node_type in buckets:
                buckets[node_type].append((path['node1'], path['node2']))

    def _new_index(old_index):
        # Remap an old node index to its new subgraph index.
        row = indexes[indexes['oldIndex'] == int(old_index)].index.tolist()[0]
        return indexes.at[row, 'newIndex']

    # The original repeated this loop three times verbatim; one parameterized
    # pass writes all three files in the same order with identical content.
    for node_type, out_name in ((1, "EDE_list.csv"),
                                (2, "EIE_list.csv"),
                                (3, "EXE_list.csv")):
        with open(out_name, 'w', encoding="utf-8") as f:
            f.write("eml1,eml2\n")
            for n1, n2 in buckets[node_type]:
                f.write(str(_new_index(n1)) + "," + str(_new_index(n2)) + "\n")
+
+import numpy as np
def meta_path_to_matrix(meta_path_file, size=6975):
    """Build a symmetric 0/1 adjacency matrix from 'eml1,eml2' meta-path pairs.

    meta_path_file: CSV whose 'eml1'/'eml2' columns hold 1-based node indexes.
    size: matrix dimension (default 6975 keeps the original hard-coded
        behavior; now parameterized for other datasets).

    Returns a (size, size) int ndarray with ones on the diagonal (self loops).
    """
    adj = np.zeros((size, size), dtype=int)
    with open(meta_path_file, 'r') as f:
        for col in csv.DictReader(f):
            i = int(col["eml1"]) - 1  # CSV is 1-based, matrix is 0-based
            j = int(col["eml2"]) - 1
            adj[i][j] = 1
            adj[j][i] = 1
    np.fill_diagonal(adj, 1)
    return adj
+
def extract_label(label_file, size=6975):
    """Read 'newIndex,label' rows into a one-hot (size, 2) label matrix.

    label_file: CSV with 'newIndex' (1-based) and 'label' (1 or 0) columns.
    size: number of rows (default 6975 preserves the original hard-coded
        behavior; now parameterized for other datasets).

    Column 0 marks label 1, column 1 marks label 0; rows never mentioned in
    the file stay all-zero (as in the original).
    """
    labels = np.zeros((size, 2), dtype=int)
    with open(label_file, 'r') as f:
        for col in csv.DictReader(f):
            row = int(col["newIndex"]) - 1  # CSV is 1-based
            if int(col["label"]) == 1:
                labels[row][0] = 1
            elif int(col["label"]) == 0:
                labels[row][1] = 1
    return labels
+
+import random
def generate_features(num_nodes=6975, num_features=8):
    """Generate a random 0/1 feature matrix (placeholder features).

    Each row gets 1..num_features random positions set to 1 (duplicates may
    collapse, so a row holds at least one 1).  Defaults preserve the original
    hard-coded 6975x8 behavior; parameters added for other dataset sizes.

    NOTE: uses the global `random` state — seed it externally for
    reproducible output.
    """
    features = np.zeros((num_nodes, num_features), dtype=int)
    for i in range(num_nodes):
        length = random.randint(1, num_features)
        for _ in range(length):
            loc = random.randint(0, num_features - 1)
            features[i][loc] = 1
    return features
+
+from scipy.io import savemat
def save_data(EDE_file, EIE_file, EXE_file, label_file, mat_file):
    """Bundle meta-path matrices, random features, labels and splits into a .mat.

    EDE_file/EIE_file/EXE_file: 'eml1,eml2' CSVs (see meta_path_to_matrix).
    label_file: 'newIndex,label' CSV (see extract_label).
    mat_file: output MATLAB file consumed downstream (presumably by an
        HAN-style model — confirm with the training code).

    The 60/20/20 train/val/test split is a fresh random permutation on every
    call; indexes are wrapped in an extra axis (shape (1, n)) as the original
    did.
    """
    num_nodes = 6975  # fixed dataset size; was repeated as a magic number
    shuffled_index = np.random.permutation(num_nodes)
    split1 = int(num_nodes * 0.6)
    split2 = int(num_nodes * 0.8)
    train_idx = np.array([shuffled_index[:split1]])
    val_idx = np.array([shuffled_index[split1:split2]])
    test_idx = np.array([shuffled_index[split2:]])

    label = extract_label(label_file)
    EDE = meta_path_to_matrix(EDE_file)
    EIE = meta_path_to_matrix(EIE_file)
    EXE = meta_path_to_matrix(EXE_file)
    features = generate_features()
    savemat(mat_file, {'EIE': EIE, 'EDE': EDE, 'EXE': EXE,
                       'features': features, 'label': label,
                       'train_idx': train_idx, 'val_idx': val_idx,
                       'test_idx': test_idx})
+
+
+
if __name__ =="__main__":
    # Pipeline stages; earlier steps are left commented out after being run
    # once (their output CSVs feed the later steps).  Uncomment to regenerate.
    # benign_fraud_count("hunter_node.csv","first_subgraph_nodes1.txt")
    # node_type_count("hunter_node.csv")
    # show_connected_subgraphs("hunter_edge_index_only.csv")
    # merge_meta_path("hunter_edge_index_only.csv","meta_path_original.csv")
    # split_meta_path("hunter_node.csv","meta_path_original.csv","subgraph_index.csv")
    # meta_path_to_matrix("EDE_list.csv")
    # new_index_to_subgraph("hunter_edge_index_only.csv","subgraph_index.csv","hunter_node.csv")
    # extract_label("subgraph_index.csv")
    # Final stage: bundle matrices, features, labels and splits into a .mat file.
    save_data("EDE_list.csv","EIE_list.csv","EXE_list.csv","subgraph_index.csv","SG_dataset.mat")
    # generate_features()