import csv
import random

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
from scipy.io import savemat

# Utilities for turning the "hunter" email graph (email, domain, IP and
# x-mailer nodes) into the meta-path adjacency matrices, labels and index
# splits that save_data() bundles into a .mat file.


def show_connected_subgraphs(edge_file):
    # Scatter-plot node count vs. edge count for every connected component.
    G = nx.Graph()
    edge_list = []
    with open(edge_file, 'r', encoding="utf-8") as edgefile:
        edges = csv.DictReader(edgefile)
        for edge in edges:
            edge_list.append((edge['node1'], edge['node2']))
    G.add_edges_from(edge_list)
    # largest = max(nx.connected_components(G), key=len)
    # largest_connected_subgraph = G.subgraph(largest)
    node_num_list = []
    edge_num_list = []
    for c in sorted(nx.connected_components(G), key=len, reverse=True):
        subgraph = G.subgraph(c)
        node_num_list.append(nx.number_of_nodes(subgraph))
        edge_num_list.append(nx.number_of_edges(subgraph))
        # with open("subgraph_edges.txt", 'a+', encoding="utf-8") as f:
        #     f.write(str(subgraph.edges) + "\n")
    x = np.array(node_num_list)
    y = np.array(edge_num_list)
    plt.xlabel("nodes")
    plt.ylabel("edges")
    plt.scatter(x, y)
    plt.show()


def node_type_count(node_file):
    # Count how many nodes of each type appear in the node file.
    node_count_dict = {}
    with open(node_file, 'r', encoding="utf-8") as nodefile:
        nodes = csv.DictReader(nodefile)
        for node in nodes:
            node_count_dict[node["type"]] = node_count_dict.get(node["type"], 0) + 1
    print(node_count_dict)


def benign_fraud_count(node_file, subgraph_node_file):
    # Count fraud/benign emails and attribute nodes inside one subgraph whose
    # node indices were dumped as a stringified Python list on a single line.
    nodes = pd.read_csv(node_file, encoding='utf-8')
    fraud_count = 0
    benign_count = 0
    domain_count = 0
    IP_count = 0
    x_mailer_count = 0
    with open(subgraph_node_file, 'r', encoding="utf-8") as f:
        line = f.readline().strip()
        line = line.replace("['", "")
        line = line.replace("']", "")
        sub_nodes = line.split("', '")
    for sub_node in sub_nodes:
        node_index = nodes[(nodes['index'] == int(sub_node))].index.tolist()[0]
        node_name = nodes.at[node_index, 'name']
        node_type = nodes.at[node_index, 'type']
        if node_type == 0:
            # Email node: names look like "emailNNNN"; numbers 6264-6550 are fraud.
            node_num = node_name[5:]
            if 6264 <= int(node_num) <= 6550:
                fraud_count += 1
            else:
                benign_count += 1
        elif node_type == 1:
            domain_count += 1
        elif node_type == 2:
            IP_count += 1
        else:
            x_mailer_count += 1
    print("fraud: " + str(fraud_count))
    print("benign: " + str(benign_count))
    print("domain: " + str(domain_count))
    print("IP: " + str(IP_count))
    print("x-mailer: " + str(x_mailer_count))


def merge_meta_path(edge_file, meta_path_file):
    # Join every pair of edges that share a node into a length-2 meta path
    # (node1, shared node, node2) and write the paths to meta_path_file.
    G = nx.Graph()
    edge_list = []
    with open(edge_file, 'r', encoding="utf-8") as edgefile:
        edges = csv.DictReader(edgefile)
        for edge in edges:
            edge_list.append((edge['node1'], edge['node2']))
    G.add_edges_from(edge_list)
    # largest = max(nx.connected_components(G), key=len)
    # largest_connected_subgraph = G.subgraph(largest)
    subgraph_edges = list(G.edges)
    # print(subgraph_edges)
    meta_path_list = []
    # NetworkX stores each undirected edge once, so the shared node can sit
    # at either position of either edge; check all four combinations.
    for i in range(0, len(subgraph_edges)):
        for j in range(0, len(subgraph_edges)):
            if i == j:
                continue  # an edge paired with itself is not a path
            if subgraph_edges[i][1] == subgraph_edges[j][0]:
                meta_path_list.append((subgraph_edges[i][0], subgraph_edges[i][1], subgraph_edges[j][1]))
            elif subgraph_edges[i][1] == subgraph_edges[j][1]:
                meta_path_list.append((subgraph_edges[i][0], subgraph_edges[i][1], subgraph_edges[j][0]))
            elif subgraph_edges[i][0] == subgraph_edges[j][0]:
                meta_path_list.append((subgraph_edges[i][1], subgraph_edges[i][0], subgraph_edges[j][1]))
            elif subgraph_edges[i][0] == subgraph_edges[j][1]:
                meta_path_list.append((subgraph_edges[i][1], subgraph_edges[i][0], subgraph_edges[j][0]))
    print(meta_path_list)
    with open(meta_path_file, 'w', encoding="utf-8") as f:
        f.write("node1,path,node2\n")
        for meta_path in meta_path_list:
            f.write(meta_path[0] + "," + meta_path[1] + "," + meta_path[2] + "\n")
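# Illustrative sketch, not part of the original pipeline: a three-node toy
# graph run through merge_meta_path. The file names "toy_edges.csv" and
# "toy_meta_path.csv" are invented for the demo.
def _demo_merge_meta_path():
    with open("toy_edges.csv", 'w', encoding="utf-8") as f:
        # Emails 1 and 3 both touch attribute node 2, so the expected meta
        # paths are ('1', '2', '3') and its mirror ('3', '2', '1').
        f.write("node1,node2\n1,2\n2,3\n")
    merge_meta_path("toy_edges.csv", "toy_meta_path.csv")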
def new_index_to_subgraph(edge_file, subgraph_index_file, node_file):
    # Assign dense 1-based indices to the email nodes that appear in the edge
    # file and write an oldIndex,newIndex,label table.
    nodes = pd.read_csv(node_file, encoding='utf-8')
    G = nx.Graph()
    edge_list = []
    with open(edge_file, 'r', encoding="utf-8") as edgefile:
        edges = csv.DictReader(edgefile)
        for edge in edges:
            edge_list.append((edge['node1'], edge['node2']))
    G.add_edges_from(edge_list)
    # largest = max(nx.connected_components(G), key=len)
    # largest_connected_subgraph = G.subgraph(largest)
    subgraph_edges = list(G.edges)
    with open(subgraph_index_file, 'w', encoding='utf-8') as new_index_file:
        new_index_file.write("oldIndex,newIndex,label\n")
        index = 1
        new_node_dict = {}
        for edge in subgraph_edges:
            for endpoint in edge:
                node_index = nodes[(nodes['index'] == int(endpoint))].index.tolist()[0]
                node_name = nodes.at[node_index, 'name']
                node_type = nodes.at[node_index, 'type']
                if node_type != 0:
                    continue  # only email nodes are re-indexed
                node_num = node_name[5:]
                if node_num in new_node_dict:
                    continue  # this email already has a new index
                new_node_dict[node_num] = str(index)
                # Emails numbered below 2010 are labelled 1, the rest 0.
                label = "1" if int(node_num) < 2010 else "0"
                new_index_file.write(endpoint + "," + str(index) + "," + label + "\n")
                index += 1


def split_meta_path(node_file, meta_path_file, index_file):
    # Split meta paths by the type of their middle node (1 = domain, 2 = IP,
    # 3 = x-mailer) into EDE/EIE/EXE pair lists remapped to the new indices.
    nodes = pd.read_csv(node_file, encoding='utf-8')
    indexes = pd.read_csv(index_file, encoding='utf-8')
    EDE_list = []
    EIE_list = []
    EXE_list = []
    with open(meta_path_file, 'r', encoding="utf-8") as f:
        paths = csv.DictReader(f)
        for path in paths:
            node_index = nodes[(nodes['index'] == int(path['path']))].index.tolist()[0]
            node_type = nodes.at[node_index, 'type']
            if node_type == 1:
                EDE_list.append((path['node1'], path['node2']))
            elif node_type == 2:
                EIE_list.append((path['node1'], path['node2']))
            elif node_type == 3:
                EXE_list.append((path['node1'], path['node2']))

    def to_new_index(old_index):
        # Look up the re-numbered subgraph index for an old node index.
        row = indexes[(indexes['oldIndex'] == int(old_index))].index.tolist()[0]
        return indexes.at[row, 'newIndex']

    for file_name, pair_list in (("EDE_list.csv", EDE_list),
                                 ("EIE_list.csv", EIE_list),
                                 ("EXE_list.csv", EXE_list)):
        with open(file_name, 'w', encoding="utf-8") as f:
            f.write("eml1,eml2\n")
            for pair in pair_list:
                f.write(str(to_new_index(pair[0])) + "," + str(to_new_index(pair[1])) + "\n")
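# Illustrative sketch, not part of the original pipeline: meta_path_to_matrix
# and extract_label below subtract 1 from newIndex, so the re-indexing must
# be dense and 1-based; this helper checks that on the index file used in
# __main__.
def _check_new_index(index_file="subgraph_index.csv"):
    indexes = pd.read_csv(index_file, encoding='utf-8')
    new_ids = sorted(indexes['newIndex'].tolist())
    assert new_ids == list(range(1, len(new_ids) + 1)), "newIndex must be dense and 1-based"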
def meta_path_to_matrix(meta_path_file):
    # Turn a list of meta-path email pairs into a symmetric 6975 x 6975
    # adjacency matrix with self-loops on the diagonal.
    num = [[0 for i in range(0, 6975)] for j in range(0, 6975)]
    with open(meta_path_file, 'r') as f:
        cols = csv.DictReader(f)
        for col in cols:
            num[int(col["eml1"]) - 1][int(col["eml2"]) - 1] = 1
            num[int(col["eml2"]) - 1][int(col["eml1"]) - 1] = 1
    for i in range(0, 6975):
        num[i][i] = 1
    arr = np.array(num)
    return arr


def extract_label(label_file):
    # One-hot labels: column 0 marks label 1, column 1 marks label 0.
    num = [[0 for i in range(0, 2)] for j in range(0, 6975)]
    with open(label_file, 'r') as f:
        cols = csv.DictReader(f)
        for col in cols:
            if int(col["label"]) == 1:
                num[int(col["newIndex"]) - 1][0] = 1
            elif int(col["label"]) == 0:
                num[int(col["newIndex"]) - 1][1] = 1
    arr = np.array(num)
    return arr


def generate_features():
    # Placeholder features: each node gets a random 8-bit binary vector with
    # at least one bit set.
    features = [[0 for i in range(0, 8)] for j in range(0, 6975)]
    for i in range(0, 6975):
        length = random.randint(1, 8)
        for j in range(0, length):
            loc = random.randint(0, 7)
            features[i][loc] = 1
    features = np.array(features)
    return features


def save_data(EDE_file, EIE_file, EXE_file, label_file, mat_file):
    # Bundle the meta-path matrices, features, labels and a random
    # 60/20/20 train/validation/test split into a single .mat file.
    shuffled_index = np.random.permutation(6975)
    split_index1 = int(6975 * 0.6)
    split_index2 = int(6975 * 0.8)
    train_idx = np.array([shuffled_index[:split_index1]])
    val_idx = np.array([shuffled_index[split_index1:split_index2]])
    test_idx = np.array([shuffled_index[split_index2:]])
    label = extract_label(label_file)
    EDE = meta_path_to_matrix(EDE_file)
    EIE = meta_path_to_matrix(EIE_file)
    EXE = meta_path_to_matrix(EXE_file)
    features = generate_features()
    savemat(mat_file, {'EIE': EIE, 'EDE': EDE, 'EXE': EXE,
                       'features': features, 'label': label,
                       'train_idx': train_idx, 'val_idx': val_idx,
                       'test_idx': test_idx})


if __name__ == "__main__":
    # benign_fraud_count("hunter_node.csv", "first_subgraph_nodes1.txt")
    # node_type_count("hunter_node.csv")
    # show_connected_subgraphs("hunter_edge_index_only.csv")
    # merge_meta_path("hunter_edge_index_only.csv", "meta_path_original.csv")
    # split_meta_path("hunter_node.csv", "meta_path_original.csv", "subgraph_index.csv")
    # meta_path_to_matrix("EDE_list.csv")
    # new_index_to_subgraph("hunter_edge_index_only.csv", "subgraph_index.csv", "hunter_node.csv")
    # extract_label("subgraph_index.csv")
    save_data("EDE_list.csv", "EIE_list.csv", "EXE_list.csv", "subgraph_index.csv", "SG_dataset.mat")
    # generate_features()
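# Illustrative sketch, not part of the original pipeline: reload the .mat
# file written by save_data with scipy's loadmat (the counterpart of the
# savemat call above) to confirm the stored array shapes. Call it manually,
# e.g. after running this script.
def _verify_mat(mat_file="SG_dataset.mat"):
    from scipy.io import loadmat
    data = loadmat(mat_file)
    print(data['EDE'].shape)        # (6975, 6975) adjacency with self-loops
    print(data['label'].shape)      # (6975, 2) one-hot labels
    print(data['features'].shape)   # (6975, 8) random binary features
    print(data['train_idx'].shape)  # (1, 4185), i.e. 60% of 6975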