| | | |
|---|---|---|
| author | unknown <[email protected]> | 2023-07-29 11:20:27 +0800 |
| committer | unknown <[email protected]> | 2023-07-29 11:20:27 +0800 |
| commit | 7592577acc00163e98b45bba86ef76bd37f93854 | |
| tree | 671fdf3b4715241c84d5b428a4b5dcaa37ac9d6b /code/hunterGraph.py | |
| parent | 5bee0b3e68ef600fea9fe8cbaca0d512a8d21998 | |
reorganize
Diffstat (limited to 'code/hunterGraph.py')
| mode | file | lines |
|---|---|---|
| -rw-r--r-- | code/hunterGraph.py | 286 |

1 file changed, 286 insertions, 0 deletions
```
diff --git a/code/hunterGraph.py b/code/hunterGraph.py
new file mode 100644
index 0000000..6332552
--- /dev/null
+++ b/code/hunterGraph.py
@@ -0,0 +1,286 @@
```

```python
import csv
import random

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
from scipy.io import savemat


def build_graph(edge_file):
    """Read an edge CSV (columns node1,node2) into an undirected graph."""
    G = nx.Graph()
    edge_list = []
    with open(edge_file, 'r', encoding="utf-8") as edgefile:
        for edge in csv.DictReader(edgefile):
            edge_list.append((edge['node1'], edge['node2']))
    G.add_edges_from(edge_list)
    return G


def show_connected_subgraphs(edge_file):
    """Scatter-plot node count vs. edge count for each connected component."""
    G = build_graph(edge_file)
    node_num_list = []
    edge_num_list = []
    for c in sorted(nx.connected_components(G), key=len, reverse=True):
        subgraph = G.subgraph(c)
        node_num_list.append(nx.number_of_nodes(subgraph))
        edge_num_list.append(nx.number_of_edges(subgraph))
        # with open("subgraph_edges.txt", 'a+', encoding="utf-8") as f:
        #     f.write(str(subgraph.edges) + "\n")

    plt.xlabel("nodes")
    plt.ylabel("edges")
    plt.scatter(np.array(node_num_list), np.array(edge_num_list))
    plt.show()


def node_type_count(node_file):
    """Count how many nodes of each type appear in the node CSV."""
    node_count_dict = {}
    with open(node_file, 'r', encoding="utf-8") as nodefile:
        for node in csv.DictReader(nodefile):
            node_count_dict[node["type"]] = node_count_dict.get(node["type"], 0) + 1
    print(node_count_dict)


def benign_fraud_count(node_file, subgraph_node_file):
    """Count fraud/benign emails and domain/IP/x-mailer nodes in a subgraph.

    subgraph_node_file holds one line formatted like "['12', '34', ...]".
    """
    nodes = pd.read_csv(node_file, encoding='utf-8')
    fraud_count = 0
    benign_count = 0
    domain_count = 0
    IP_count = 0
    x_mailer_count = 0
    with open(subgraph_node_file, 'r', encoding="utf-8") as f:
        line = f.readline().strip()
        line = line.replace("['", "").replace("']", "")
        sub_nodes = line.split("', '")
    for sub_node in sub_nodes:
        node_index = nodes[(nodes['index'] == int(sub_node))].index.tolist()[0]
        node_name = nodes.at[node_index, 'name']
        node_type = nodes.at[node_index, 'type']
        if node_type == 0:
            # Email node: the name ends in a number; 6264-6550 are fraud samples.
            node_num = node_name[5:]
            if 6264 <= int(node_num) <= 6550:
                fraud_count += 1
            else:
                benign_count += 1
        elif node_type == 1:
            domain_count += 1
        elif node_type == 2:
            IP_count += 1
        else:
            x_mailer_count += 1
    print("fraud: " + str(fraud_count))
    print("benign: " + str(benign_count))
    print("domain: " + str(domain_count))
    print("IP: " + str(IP_count))
    print("x-mailer: " + str(x_mailer_count))


def merge_meta_path(edge_file, meta_path_file):
    """Enumerate length-2 paths (node1, middle, node2) and write them to CSV."""
    G = build_graph(edge_file)
    subgraph_edges = list(G.edges)
    meta_path_list = []
    for i in range(0, len(subgraph_edges)):
        for j in range(0, len(subgraph_edges)):
            if i == j:
                continue  # pairing an edge with itself only yields degenerate paths
            if subgraph_edges[i][1] == subgraph_edges[j][0]:
                meta_path_list.append((subgraph_edges[i][0], subgraph_edges[i][1], subgraph_edges[j][1]))
            elif subgraph_edges[i][1] == subgraph_edges[j][1]:
                meta_path_list.append((subgraph_edges[i][0], subgraph_edges[i][1], subgraph_edges[j][0]))
    print(meta_path_list)
    with open(meta_path_file, 'w', encoding="utf-8") as f:
        f.write("node1,path,node2\n")
        for meta_path in meta_path_list:
            f.write(meta_path[0] + "," + meta_path[1] + "," + meta_path[2] + "\n")


def new_index_to_subgraph(edge_file, subgraph_index_file, node_file):
    """Renumber the subgraph's email nodes and label them fraud (1) or benign (0)."""
    nodes = pd.read_csv(node_file, encoding='utf-8')
    G = build_graph(edge_file)
    with open(subgraph_index_file, 'w', encoding='utf-8') as new_index_file:
        new_index_file.write("oldIndex,newIndex,label\n")
        index = 1
        new_node_dict = {}
        for edge in G.edges:
            # Handle both endpoints. (The original `continue`d to the next edge
            # when the first endpoint was already indexed, silently skipping
            # the second endpoint.)
            for old_index in edge:
                node_index = nodes[(nodes['index'] == int(old_index))].index.tolist()[0]
                node_name = nodes.at[node_index, 'name']
                node_type = nodes.at[node_index, 'type']
                if node_type != 0 or node_name[5:] in new_node_dict:
                    continue
                new_node_dict[node_name[5:]] = str(index)
                # Emails numbered below 2010 are labeled 1 (fraud), the rest 0.
                label = "1" if int(node_name[5:]) < 2010 else "0"
                new_index_file.write(old_index + "," + str(index) + "," + label + "\n")
                index += 1


def split_meta_path(node_file, meta_path_file, index_file):
    """Split meta paths by middle-node type into EDE/EIE/EXE pair lists."""
    nodes = pd.read_csv(node_file, encoding='utf-8')
    indexes = pd.read_csv(index_file, encoding='utf-8')
    EDE_list = []  # email-domain-email
    EIE_list = []  # email-IP-email
    EXE_list = []  # email-x-mailer-email
    with open(meta_path_file, 'r', encoding="utf-8") as f:
        for path in csv.DictReader(f):
            node_index = nodes[(nodes['index'] == int(path['path']))].index.tolist()[0]
            node_type = nodes.at[node_index, 'type']
            if node_type == 1:
                EDE_list.append((path['node1'], path['node2']))
            elif node_type == 2:
                EIE_list.append((path['node1'], path['node2']))
            elif node_type == 3:
                EXE_list.append((path['node1'], path['node2']))
    # Map the old node indices to the new subgraph indices and write each list.
    for out_file, pair_list in (("EDE_list.csv", EDE_list),
                                ("EIE_list.csv", EIE_list),
                                ("EXE_list.csv", EXE_list)):
        with open(out_file, 'w', encoding="utf-8") as f:
            f.write("eml1,eml2\n")
            for pair in pair_list:
                new_node_index = indexes[(indexes['oldIndex'] == int(pair[0]))].index.tolist()[0]
                node1_num = indexes.at[new_node_index, 'newIndex']
                new_node_index = indexes[(indexes['oldIndex'] == int(pair[1]))].index.tolist()[0]
                node2_num = indexes.at[new_node_index, 'newIndex']
                f.write(str(node1_num) + "," + str(node2_num) + "\n")


def meta_path_to_matrix(meta_path_file):
    """Build a symmetric 6975x6975 adjacency matrix (with self-loops) from a pair list."""
    num = [[0 for i in range(0, 6975)] for j in range(0, 6975)]
    with open(meta_path_file, 'r') as f:
        for col in csv.DictReader(f):
            num[int(col["eml1"]) - 1][int(col["eml2"]) - 1] = 1
            num[int(col["eml2"]) - 1][int(col["eml1"]) - 1] = 1
    for i in range(0, 6975):
        num[i][i] = 1
    return np.array(num)


def extract_label(label_file):
    """One-hot labels: column 0 = fraud (label 1), column 1 = benign (label 0)."""
    num = [[0 for i in range(0, 2)] for j in range(0, 6975)]
    with open(label_file, 'r') as f:
        for col in csv.DictReader(f):
            if int(col["label"]) == 1:
                num[int(col["newIndex"]) - 1][0] = 1
            elif int(col["label"]) == 0:
                num[int(col["newIndex"]) - 1][1] = 1
    return np.array(num)


def generate_features():
    """Generate random binary placeholder features (8 dims per node)."""
    features = [[0 for i in range(0, 8)] for j in range(0, 6975)]
    for i in range(0, 6975):
        length = random.randint(1, 8)
        for j in range(0, length):
            loc = random.randint(0, 7)
            features[i][loc] = 1
    return np.array(features)


def save_data(EDE_file, EIE_file, EXE_file, label_file, mat_file):
    """Pack adjacency matrices, features, labels and a 60/20/20 split into a .mat file."""
    shuffled_index = np.random.permutation(6975)
    split_index1 = int(6975 * 0.6)
    split_index2 = int(6975 * 0.8)
    train_idx = np.array([shuffled_index[:split_index1]])
    val_idx = np.array([shuffled_index[split_index1:split_index2]])
    test_idx = np.array([shuffled_index[split_index2:]])
    label = extract_label(label_file)
    EDE = meta_path_to_matrix(EDE_file)
    EIE = meta_path_to_matrix(EIE_file)
    EXE = meta_path_to_matrix(EXE_file)
    features = generate_features()
    savemat(mat_file, {'EIE': EIE, 'EDE': EDE, 'EXE': EXE,
                       'features': features, 'label': label,
                       'train_idx': train_idx, 'val_idx': val_idx,
                       'test_idx': test_idx})


if __name__ == "__main__":
    # benign_fraud_count("hunter_node.csv", "first_subgraph_nodes1.txt")
    # node_type_count("hunter_node.csv")
    # show_connected_subgraphs("hunter_edge_index_only.csv")
    # merge_meta_path("hunter_edge_index_only.csv", "meta_path_original.csv")
    # split_meta_path("hunter_node.csv", "meta_path_original.csv", "subgraph_index.csv")
    # meta_path_to_matrix("EDE_list.csv")
    # new_index_to_subgraph("hunter_edge_index_only.csv", "subgraph_index.csv", "hunter_node.csv")
    # extract_label("subgraph_index.csv")
    save_data("EDE_list.csv", "EIE_list.csv", "EXE_list.csv", "subgraph_index.csv", "SG_dataset.mat")
    # generate_features()
```
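For reference, a minimal sketch of how the `SG_dataset.mat` produced by `save_data` could be read back. The key names come from the `savemat` call above; loading it with `scipy.io.loadmat` and unwrapping the 1xN index rows is an assumption about downstream use, not part of this commit.

```python
# Minimal loading sketch, assuming only the keys written by save_data() above.
from scipy.io import loadmat

data = loadmat("SG_dataset.mat")
EDE = data['EDE']                 # 6975x6975 email-domain-email adjacency
features = data['features']       # 6975x8 random placeholder features
label = data['label']             # 6975x2 one-hot labels (fraud, benign)
train_idx = data['train_idx'][0]  # savemat stored a 1xN row; take the vector
print(EDE.shape, features.shape, label.shape, train_idx.shape)
```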
