diff options
| author | 尹姜谊 <[email protected]> | 2024-01-12 18:12:32 +0800 |
|---|---|---|
| committer | 尹姜谊 <[email protected]> | 2024-01-12 18:12:32 +0800 |
| commit | 4111d1fac7c21e701cb9be1c1365a5ffdaab94e4 (patch) | |
| tree | 8c3f5dfc615da2575450cb67ada254c0f8de17ec /detection | |
Initial commit
Diffstat (limited to 'detection')
26 files changed, 1209 insertions, 0 deletions
diff --git a/detection/__init__.py b/detection/__init__.py
new file mode 100644
index 0000000..0c8fce0
--- /dev/null
+++ b/detection/__init__.py
@@ -0,0 +1,6 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# @Time : 2024/1/8 11:42
+# @author : yinjiangyi
+# @File : __init__.py
+# @Function:
diff --git a/detection/__pycache__/__init__.cpython-39.pyc b/detection/__pycache__/__init__.cpython-39.pyc
Binary files differ
new file mode 100644
index 0000000..a39bcb4
--- /dev/null
+++ b/detection/__pycache__/__init__.cpython-39.pyc
diff --git a/detection/__pycache__/vpn_detector.cpython-39.pyc b/detection/__pycache__/vpn_detector.cpython-39.pyc
Binary files differ
new file mode 100644
index 0000000..c491155
--- /dev/null
+++ b/detection/__pycache__/vpn_detector.cpython-39.pyc
diff --git a/detection/tool/Config.py b/detection/tool/Config.py
new file mode 100644
index 0000000..214c3c9
--- /dev/null
+++ b/detection/tool/Config.py
@@ -0,0 +1,38 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# @Time : 2024/1/8 12:36
+# @author : yinjiangyi
+# @File : Config.py
+# @Function: read the config.yaml file and manage all configuration values
+
+import yaml
+import os
+import sys
+sys.path.append('..')
+from tool.Functions import get_project_path
+
+
+class Config:
+    def __init__(self):
+        # read the config.yaml file at the project root
+        self.config = self.read_yaml(get_project_path() + '/config.yaml')
+
+    # read a yaml file into a dict
+    def read_yaml(self, yaml_file):
+        try:
+            with open(yaml_file, 'r', encoding='utf-8') as f:
+                config = yaml.load(f.read(), Loader=yaml.FullLoader)
+            return config
+        except Exception as e:
+            print(e)
+            sys.exit()
+
+
+if __name__ == '__main__':
+    config = Config().config
+    print(config)
+    print(config['time_zone'])
+    print(config['sql']['cyberghost']['sql_tsg'])
diff --git a/detection/tool/Functions.py b/detection/tool/Functions.py
new file mode 100644
index 0000000..043be6b
--- /dev/null
+++ b/detection/tool/Functions.py
@@ -0,0 +1,495 @@
+'''
+Description:
+Author: chenxu
+Date: 2022-03-31 15:15:09
+LastEditTime: 2022-09-29 10:59:11
+LastEditors: yinjiangyi
+'''
+import datetime
+import os
+import re
+import shutil
+
+import pytz
+import yaml
+import struct
+import socket
+import ipaddress
+import numpy as np
+import pandas as pd
+
+from clickhouse_driver import Client
+
+
+
+'''
+Description: check whether a file exists
+Param : file path
+Return: True if the file exists, False otherwise
+'''
+
+
+def fileExists(readfilePath):
+    return os.path.exists(readfilePath)
+
+
+'''
+Description: read a text file into a list of whitespace-stripped lines
+Param :
+Return:
+param {*} readfilePath
+'''
+
+
+def readTxt(readfilePath):
+ with open(readfilePath) as file:
+ lines = file.readlines()
+ listStr = list()
+ for line in lines:
+ listStr.append("".join(line.split()))
+
+ return listStr
+
+
+'''
+Description: append content to a file, one item per line
+Param :
+Return:
+param {*} writeFilePath
+param {*} content
+'''
+
+
+def write(writeFilePath, content):
+    with open(writeFilePath, 'a') as file:
+        for item in content:
+            file.write("".join(item))
+            file.write('\n')
+
+
+'''
+Description: truncate a file's contents
+Param :
+Return:
+param {*} writeFilePath
+'''
+
+
+def clear(writeFilePath):
+    # opening in 'w' mode truncates the file; the original called the
+    # no-op attribute file.closed instead of file.close()
+    with open(writeFilePath, 'w'):
+        pass
+
+
+def clear_dir(dir_path, git_keep=False):
+ shutil.rmtree(dir_path)
+ os.mkdir(dir_path)
+ if git_keep:
+ open(dir_path + "/.gitkeep", 'a').close()
+
+
+'''
+    Description: check whether a domain name is valid
+    Param : domain name
+    Return: True if valid, False otherwise
+'''
+
+
+def is_valid_domain(domain):
+    pattern = re.compile(
+        r'^(([a-zA-Z]{1})|([a-zA-Z]{1}[a-zA-Z]{1})|'
+        r'([a-zA-Z]{1}[0-9]{1})|([0-9]{1}[a-zA-Z]{1})|'
+        r'([a-zA-Z0-9][-_.a-zA-Z0-9]{0,61}[a-zA-Z0-9]))\.'
+        r'([a-zA-Z]{2,13}|[a-zA-Z0-9-]{2,30}\.[a-zA-Z]{2,3})$'
+    )
+    return bool(pattern.match(domain))
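+
+# Illustrative behaviour of the pattern above (a sketch, not part of the
+# original tests):
+#   is_valid_domain('vpn.example.com')  -> True
+#   is_valid_domain('localhost')        -> False  (no dot-separated TLD)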
+
+
+'''
+    Description: count entries in a host list that are not valid domains
+    Param : host list
+    Return: ratio of non-domain entries
+'''
+
+
+def isNotDomain(hostList):
+    isNotdomainNum = 0
+    for item in hostList:
+        item = item.split(':')[0]
+        if not is_valid_domain(item):
+            isNotdomainNum += 1
+    if len(hostList) > 0:
+        return isNotdomainNum / len(hostList)
+
+    return 0
+
+
+'''
+    Description: count www.<word>.com style domains with length >= 17 in a host list
+    Param : host list
+    Return: ratio of matching domains
+'''
+
+
+def wwwDomain(hostList):
+ pattern = re.compile(r'(www)\.([a-zA-Z]{5,25})\.(com)')
+ cnt = 0
+ for item in hostList:
+ if is_valid_domain(item) and pattern.match(item) and len(item) >= 17:
+ cnt = cnt + 1
+ if len(hostList) > 0:
+ return cnt / len(hostList)
+
+ return 0
+
+
+def readYaml(path):
+    with open(path, encoding="utf-8") as file:
+        data = yaml.safe_load(file)
+    return data
+
+
+"""
+ 将列表拆分为指定长度的多个列表
+ :param lists: 初始列表
+ :param cut_len: 每个列表的长度
+ :return: 一个二维数组 [[x,x],[x,x]]
+ """
+
+
+def cut_list(lists, cut_len):
+ res_data = []
+ if len(lists) > cut_len:
+ for i in range(int(len(lists) / cut_len)):
+ cut_a = lists[cut_len * i:cut_len * (i + 1)]
+ res_data.append(cut_a)
+
+ last_data = lists[int(len(lists) / cut_len) * cut_len:]
+ if last_data:
+ res_data.append(last_data)
+ else:
+ res_data.append(lists)
+
+ return res_data
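+
+# A minimal usage sketch (illustrative only):
+#   cut_list([1, 2, 3, 4, 5], 2)  -> [[1, 2], [3, 4], [5]]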
+
+
+'''
+Description: query the IP geolocation database
+Param : ip
+Return: IP location info (ISP owner or country code)
+'''
+
+
+def queryIpDatabase(reader, ip, query_type):
+    # reader = awdb.open_database(path)
+    # query_type renamed from `type` to avoid shadowing the builtin
+    (record, prefix_len) = reader.get_with_prefix_len(ip)
+    if query_type == 'isp':
+        return bytes.decode(record.get('owner'))
+    if query_type == 'country':
+        return bytes.decode(record.get('areacode'))
+
+
+'''
+Description: match an ISP name against known VPN hosting providers
+Param : isp name
+Return: 1 if matched, 0 otherwise
+'''
+
+
+def ipReputation(providerStr):
+ providerList = ['IONOS SE', 'M247 Ltd', 'AltusHost B.V.', 'Packet Exchange Limited', 'Orion Network Limited',
+ 'DigitalOcean, LLC', 'Greenhost BV', 'UK-2 Limited', 'RouteLabel V.O.F.', 'InMotion Hosting, Inc.',
+ 'ONLINE S.A.S.', 'Linode, LLC', 'Hosting Services, Inc.', 'Performive LLC']
+ for item in providerList:
+ if item in providerStr or providerStr in item:
+ return 1
+ return 0
+
+
+'''
+Description: match a country code
+Param : country
+Return: 1 if matched, 0 otherwise
+'''
+
+
+def ipCountry(Country):
+ if Country == 'ET':
+ return 1
+ return 0
+
+
+'''
+Description: check whether the source IP belongs to the target ISP's address blocks
+Param :
+Return: True if it does, False otherwise
+param {str} ip_address
+'''
+
+
+def queryipBlock(data, ip):
+    # convert the dotted quad to a host-order integer for range comparison
+    intIp = socket.ntohl(struct.unpack("I", socket.inet_aton(str(ip)))[0])
+    for index, row in data.iterrows():
+        # inclusive bounds: block endpoints count as members of the block
+        if row['minip'] <= intIp <= row['maxip']:
+            return True
+    return False
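+
+# Worked example of the conversion above (illustrative): inet_aton('1.2.3.4')
+# yields b'\x01\x02\x03\x04'; unpacking natively and applying ntohl gives the
+# big-endian integer 0x01020304 == 16909060, so dotted quads compare as ints.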
+
+
+# '''
+# Description: collect ISP names based on the spur labelling results
+# Param :
+# Return:
+# '''
+# def collectIspName():
+# reader = awdb.open_database('developerKits/IP_city_single_WGS84.awdb')
+# spurLabel = pd.read_csv("externalTest/data/spur.csv",names=['ip','label'])
+# spurVPNip = list()
+# for index,row in spurLabel.iterrows():
+# if row['label']!='0':
+# spurVPNip.append(queryIpDatabase(reader,row['ip'],'isp'))
+# print(Counter(spurVPNip))
+
+
+'''
+Description: deduplicate the lines of a file
+Param :
+Return:
+'''
+
+
+def duplicateRemoval(readDir, writeDir):
+    lines_seen = set()
+    with open(readDir, "r") as infile, open(writeDir, "w") as outfile:
+        for line in infile:
+            if line not in lines_seen:
+                outfile.write(line)
+                lines_seen.add(line)
+
+
+'''
+Description: check whether an IP is a private (LAN) address
+Param :
+Return:
+param {*} ip
+'''
+
+
+def is_lan(ip):
+ try:
+ return ipaddress.ip_address(ip.strip()).is_private
+ except Exception as e:
+ return False
+
+
+def getAsnList(file_path):
+    '''
+    get asn list from file; the first field of each line is a label and the
+    remaining comma-separated fields are collected as ASN strings
+    :return: asn list
+    '''
+    asn_list = []
+    with open(file_path) as file:
+        for line in file:
+            # split() always yields at least one field, so no length check is needed
+            fields = line.split(',')
+            asn_list.extend([str(i.strip()) for i in fields[1:]])
+    return asn_list
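+
+# Assumed input layout (first field is a label, the rest are ASNs), e.g.:
+#   cyberghost,AS9009,AS136787,AS42159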
+
+
+def getFeatureList(file_path):
+ features = []
+ with open(file_path) as file:
+ lines = file.readlines()
+ for line in lines:
+ features.append(line.strip())
+ return features
+
+
+def is_valid_ip(ip_str):
+ try:
+ ipaddress.ip_address(ip_str)
+ return True
+ except ValueError:
+ return False
+
+
+def find_invalid_ip(ip_list):
+ error_ip_list = []
+ for ip in ip_list:
+ if not is_valid_ip(ip):
+ error_ip_list.append(ip)
+ return error_ip_list
+
+
+def filter_files_by_time_range(path, start_day, end_day, suffix='.csv'):
+    # collect the paths of files whose timestamp falls in [start_day, end_day)
+    result = []
+
+    start_day_time = datetime.datetime.strptime(start_day, '%Y-%m-%d').date()
+    end_day_time = datetime.datetime.strptime(end_day, '%Y-%m-%d').date()
+
+    # walk the directory tree and filter files by timestamp
+    for root, dirs, files in os.walk(path):
+        for file in files:
+            # keep only files with the expected suffix (default: .csv)
+            if len(suffix) > 0 and file.endswith(suffix):
+                filepath = os.path.join(root, file)
+                # the original intent was creation time, but access time is
+                # what is actually used here
+                # created_time = os.path.getctime(filepath)
+                created_time = os.path.getatime(filepath)
+                # convert the timestamp to a date
+                created_date = datetime.datetime.fromtimestamp(created_time).date()
+                if start_day_time <= created_date < end_day_time:
+                    result.append(filepath)
+        for d in dirs:
+            # recurse into each subdirectory and merge the results
+            result.extend(filter_files_by_time_range(os.path.join(root, d), start_day, end_day, suffix))
+
+    return list(set(result))
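+
+# Note: os.walk already descends into subdirectories, so the explicit
+# recursion above revisits each subtree; the trailing set() deduplicates
+# the overlapping results.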
+
+
+def filter_files_by_created_time(path, days, suffix='.csv'):
+    # build a [today - days, tomorrow) window and delegate to the range filter
+    end_day = datetime.date.today() + datetime.timedelta(days=1)
+    start_day = datetime.date.today() - datetime.timedelta(days=days)
+
+    return filter_files_by_time_range(path, start_day.strftime('%Y-%m-%d'), end_day.strftime('%Y-%m-%d'), suffix)
+
+
+def delete_dir_by_create_time(path, days):
+    # delete files under `path` older than `days` (by modification time)
+    ds = list(os.walk(path))
+    delta = datetime.timedelta(days=days)
+    now = datetime.datetime.now()
+
+    for d in ds:
+        os.chdir(d[0])
+        if d[2]:
+            for x in d[2]:
+                # despite the function name, this uses the modification time
+                mtime = datetime.datetime.fromtimestamp(os.path.getmtime(x))
+                if mtime < (now - delta):
+                    os.remove(x)
+    os.chdir(os.path.abspath(os.path.join(os.path.dirname(__file__), "../..")))
+
+
+class connectTest():
+ def __init__(self, config):
+ self.headTime = config['headTime']
+ self.tailTime = config['tailTime']
+ self.tableName = config['tableName']
+ self.timeZone = config['timeZone']
+ self.client = Client(user=config['username'], password=config['password'], host=config['host'],
+ port=config['port'], database=config['database'])
+ self.dbname = config['database']
+
+    def dataTest(self):
+        """
+        :return: 1: data present  0: no data  2: wrong configuration
+        """
+        # check whether the table contains any data in the query window
+        try:
+            testSQL = "select * from " + self.tableName + " where common_recv_time >= toDateTime('" \
+                      + str(self.headTime) + "', '" + self.timeZone + "') and common_recv_time < toDateTime('" \
+                      + str(self.tailTime) + "', '" + self.timeZone + "') limit 1 "
+
+            queryResult = self.client.execute(testSQL)
+
+            if len(queryResult) < 1:
+                return 0
+            return 1
+        except Exception:
+            # most likely a wrong database configuration in config.yaml
+            return 2
+
+
+def get_project_path():
+    # NOTE: shadowed by the get_project_path() redefined at the bottom of
+    # this module, so this cwd-based variant never takes effect
+    path = os.path.join(os.getcwd())
+    return path.rsplit('vpn-thwarting', 1)[0] + 'vpn-thwarting'
+
+
+def cal_psi(actual, predict, bins=10):
+    """
+    Description: compute the PSI (Population Stability Index) between the
+    actual and expected score distributions
+    :param actual: Array or Series of observed data, e.g. training-set scores
+    :param predict: Array or Series of expected data, e.g. test-set scores
+    :param bins: number of bins
+    :return:
+        psi: float, the PSI value
+        psi_df: DataFrame with the per-bin breakdown
+    """
+    actual_min = actual.min()  # minimum observed score
+    actual_max = actual.max()  # maximum observed score
+    binlen = (actual_max - actual_min) / bins
+    cuts = [actual_min + i * binlen for i in range(1, bins)]  # interior bin edges
+    cuts.insert(0, -float("inf"))
+    cuts.append(float("inf"))
+    actual_cuts = np.histogram(actual, bins=cuts)  # equal-width binning of actual
+    predict_cuts = np.histogram(predict, bins=cuts)  # bin predict with the same edges
+    actual_df = pd.DataFrame(actual_cuts[0], columns=['actual'])
+    predict_df = pd.DataFrame(predict_cuts[0], columns=['predict'])
+    psi_df = pd.merge(actual_df, predict_df, right_index=True, left_index=True)
+    # add 1 to the numerator so empty bins cannot zero out the PSI log term
+    psi_df['actual_rate'] = (psi_df['actual'] + 1) / psi_df['actual'].sum()
+    psi_df['predict_rate'] = (psi_df['predict'] + 1) / psi_df['predict'].sum()
+    psi_df['psi'] = (psi_df['actual_rate'] - psi_df['predict_rate']) * np.log(
+        psi_df['actual_rate'] / psi_df['predict_rate'])
+    psi = psi_df['psi'].sum()
+    return psi, psi_df
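+
+# A minimal sketch of calling cal_psi on synthetic scores (illustrative only):
+#   train_scores = pd.Series(np.random.rand(1000))
+#   test_scores = pd.Series(np.random.rand(1000))
+#   psi, psi_df = cal_psi(train_scores, test_scores, bins=10)
+#   # a PSI below 0.1 is commonly read as a stable score distribution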
+
+
+def get_file_line_count(file_path):
+    """
+    Description: count the number of lines in a file
+    :param file_path: file path
+    :return: line count
+    """
+    count = 0
+    with open(file_path, 'r', encoding='utf-8') as f:
+        for line in f:
+            count += 1
+    return count
+
+
+def check_internet(timeout=3, servername='www.baidu.com'):
+    """
+    Description: check whether the internet is reachable by connecting to a
+    well-known server; the timeout defaults to 3 seconds
+    :return: True or False
+    """
+    try:
+        socket.setdefaulttimeout(timeout)
+        host = socket.gethostbyname(servername)
+        # use the timeout parameter here too (the original hard-coded 2 seconds)
+        s = socket.create_connection((host, 80), timeout)
+        s.close()
+        return True
+    except Exception:
+        return False
+
+
+def get_project_path():
+    # return the grandparent directory of this file (the project root);
+    # this definition overrides the earlier cwd-based one above
+    return os.path.abspath(os.path.join(os.path.dirname(__file__), "../.."))
+
+
+
+
+
diff --git a/detection/tool/KnowledgeBaseTool.py b/detection/tool/KnowledgeBaseTool.py
new file mode 100644
index 0000000..1b55b51
--- /dev/null
+++ b/detection/tool/KnowledgeBaseTool.py
@@ -0,0 +1,98 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# @Time : 2023/6/12 16:09
+# @author : yinjiangyi
+# @File : KnowledgeBaseTool.py
+# @Function: client for the knowledge base HTTP API
+
+import json
+import sys
+import time
+from warnings import simplefilter
+import requests
+from tool import Functions, LoggingTool
+
+logger = LoggingTool.Logger().getLogger()
+simplefilter(action='ignore', category=FutureWarning)
+
+
+class KnowledgeApi:
+    def __init__(self, config):
+        self.config = config
+        self.api_address = config['knowledgebase']['host']
+        self.api_username = config['knowledgebase']['kb_username']
+        self.api_pin = config['knowledgebase']['api_pin']
+        self.api_path = config['knowledgebase']['api_path']
+        self.retry_max = config['knowledgebase']['api_retry_times']
+        self.request_timeout = config['knowledgebase']['api_timeout']
+        # self.api_token = self.get_api_token()
+        self.api_token = config['knowledgebase']['api_token']
+
+    def get_api_token(self):
+        url = "http://" + self.api_address + "/sys/login"
+        data = {
+            "username": self.api_username,
+            "pin": self.api_pin
+        }
+        try:
+            resp = requests.post(url, data=json.dumps(data),
+                                 headers={'Content-Type': 'application/json'}, timeout=self.request_timeout)
+            return json.loads(resp.text).get('data').get('token')
+        except Exception as e:
+            logger.error(e)
+            sys.exit()
+
+    def get_library_id(self, library_name):
+        url = "http://" + self.api_address + "/v1/knowledgeBase/list?name=" + library_name
+        header = {
+            "Cn-Authorization": self.api_token
+        }
+        resp = None
+        try:
+            resp = requests.get(url, headers=header, timeout=self.request_timeout)
+            return json.loads(resp.text).get('data').get('list')[0].get('knowledgeId')
+        except Exception as e:
+            # resp may be unset if the request itself failed
+            if resp is not None:
+                logger.error(resp.text)
+            logger.error(e)
+            sys.exit()
+
+    def file_import(self, file_path, knowledge_id, action, description=''):
+        url = 'http://' + self.api_address + self.api_path
+
+        param = {
+            "knowledgeId": knowledge_id,
+            "action": action,
+            "description": description
+        }
+
+        header = {
+            "Cn-Authorization": self.api_token
+        }
+
+        is_succeed = 0
+        response_code = -1
+        with open(file_path, "rb") as file:
+            file_object = {"file": file}
+            for i in range(self.retry_max):
+                try:
+                    resp = requests.post(url, files=file_object, params=param,
+                                         headers=header, timeout=self.request_timeout)
+                    resp = json.loads(resp.text)
+                    logger.info(resp)
+                    response_code = resp.get('code')
+                    if response_code == 200:
+                        is_succeed = 1
+                        break
+                except requests.RequestException as e:
+                    time.sleep(5)
+                    logger.error(e)
+                    continue
+
+        if not is_succeed:
+            logger.error('Import failed!')
+            sys.exit()
+        else:
+            logger.info('Import succeeded. Response code {}.'.format(response_code))
diff --git a/detection/tool/LoggingTool.py b/detection/tool/LoggingTool.py
new file mode 100644
index 0000000..f298964
--- /dev/null
+++ b/detection/tool/LoggingTool.py
@@ -0,0 +1,33 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# @Time : 2023/3/18 15:36
+# @author : yinjiangyi
+# @File : LoggingTool.py
+# @Function: project-wide logger configured from logging.yaml
+
+
+import yaml
+import logging.config
+import sys
+sys.path.append('..')
+from tool.Functions import get_project_path
+
+
+class Logger:
+    LOGGER_NAME = 'runLogger'
+
+    def getLogger(self, loggerName=LOGGER_NAME):
+        logging.config.dictConfig(load_config())
+        logger = logging.getLogger(loggerName)
+        return logger
+
+
+def load_config():
+    project_path = get_project_path()
+    path = project_path + '/logging.yaml'
+    with open(path, 'r') as f:
+        config = yaml.load(f, Loader=yaml.FullLoader)
+    # the log filename from logging.yaml is used as-is
+    return config
diff --git a/detection/tool/MariadbTool.py b/detection/tool/MariadbTool.py
new file mode 100644
index 0000000..7a839cd
--- /dev/null
+++ b/detection/tool/MariadbTool.py
@@ -0,0 +1,54 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# @Time : 2023/7/7 10:38
+# @author : yinjiangyi
+# @File : MariadbTool.py
+# @Function: thin wrapper around a pymysql connection
+
+import pymysql
+
+
+class MariadbUtil:
+
+    def __init__(self, host, port, user, password, database):
+        self.db = pymysql.connect(host=host,
+                                  port=port,
+                                  user=user,
+                                  password=password,
+                                  database=database)
+        self.cursor = self.db.cursor()
+
+    def query_sql(self, sql):
+        """query multiple rows"""
+        list_result = ()
+        try:
+            self.cursor.execute(sql)
+            list_result = self.cursor.fetchall()
+        except Exception as e:
+            print(e)
+        return list_result
+
+    def insert(self, sql):
+        """insert data"""
+        return self.__edit(sql)
+
+    def update(self, sql):
+        """update data"""
+        return self.__edit(sql)
+
+    def delete(self, sql):
+        """delete data"""
+        return self.__edit(sql)
+
+    def __edit(self, sql):
+        count = 0
+        try:
+            count = self.cursor.execute(sql)
+            self.db.commit()
+        except Exception as e:
+            print(e)
+        return count
+
+    def close(self):
+        self.cursor.close()
+        # also close the underlying connection, not just the cursor
+        self.db.close()
diff --git a/detection/tool/ProgramInit.py b/detection/tool/ProgramInit.py
new file mode 100644
index 0000000..a6b19b1
--- /dev/null
+++ b/detection/tool/ProgramInit.py
@@ -0,0 +1,24 @@
+'''
+Description: initialize (clear) the data files used by the detector
+Author: chenxu
+Date: 2022-09-20 10:53:28
+LastEditTime: 2022-11-02 15:01:07
+LastEditors:
+'''
+import Functions
+import os
+
+
+def dataFilerClear():
+    # clear detection data files and directories
+    Functions.clear('PsiphonServerIP.txt')
+    Functions.clear_dir('logs/', git_keep=True)
+    Functions.clear_dir('data/feature', git_keep=True)
+    Functions.clear_dir('data/result', git_keep=True)
+    Functions.clear_dir('data/model/pic')
+
+
+def modelClear():
+    os.remove("data/model/originModel.model")
+    os.remove('data/model/RFmodel.model')
+
+
+dataFilerClear()
diff --git a/detection/tool/ResultEvaluation.py b/detection/tool/ResultEvaluation.py
new file mode 100644
index 0000000..6ee58df
--- /dev/null
+++ b/detection/tool/ResultEvaluation.py
@@ -0,0 +1,28 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# @Time : 2023/3/27 19:34
+# @author : yinjiangyi
+# @File : ResultEvaluation.py
+# @Function: collect detection results for evaluation
+
+import os
+import pandas as pd
+
+from tool import Functions
+
+
+class LabelScraper:
+
+    def __init__(self, start_time, end_time):
+        self.start_time = start_time
+        self.end_time = end_time
+        self.detected_df = self.read_result()
+
+    def read_result(self):
+        # the two date arguments match filter_files_by_time_range, not
+        # filter_files_by_created_time (which takes a day count)
+        result_frames = []
+        files = Functions.filter_files_by_time_range('data/result', '2023-01-01', '2023-03-30')
+        for file in files:
+            result_frames.append(pd.read_csv(file, names=['ip', 'pred_y', 'score', 'time']))
+
+        if result_frames:
+            return pd.concat(result_frames, ignore_index=True)
+        return pd.DataFrame(columns=['ip', 'pred_y', 'score', 'time'])
diff --git a/detection/tool/__init__.py b/detection/tool/__init__.py
new file mode 100644
index 0000000..273b330
--- /dev/null
+++ b/detection/tool/__init__.py
@@ -0,0 +1,6 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# @Time : 2024/1/12 17:29
+# @author : yinjiangyi
+# @File : __init__.py
+# @Function:
diff --git a/detection/tool/__pycache__/Config.cpython-39.pyc b/detection/tool/__pycache__/Config.cpython-39.pyc
Binary files differ
new file mode 100644
index 0000000..f1f042c
--- /dev/null
+++ b/detection/tool/__pycache__/Config.cpython-39.pyc
diff --git a/detection/tool/__pycache__/Functions.cpython-39.pyc b/detection/tool/__pycache__/Functions.cpython-39.pyc
Binary files differ
new file mode 100644
index 0000000..1375d38
--- /dev/null
+++ b/detection/tool/__pycache__/Functions.cpython-39.pyc
diff --git a/detection/tool/__pycache__/KnowledgeBaseTool.cpython-39.pyc b/detection/tool/__pycache__/KnowledgeBaseTool.cpython-39.pyc
Binary files differ
new file mode 100644
index 0000000..11eda10
--- /dev/null
+++ b/detection/tool/__pycache__/KnowledgeBaseTool.cpython-39.pyc
diff --git a/detection/tool/__pycache__/LoggingTool.cpython-39.pyc b/detection/tool/__pycache__/LoggingTool.cpython-39.pyc
Binary files differ
new file mode 100644
index 0000000..2527566
--- /dev/null
+++ b/detection/tool/__pycache__/LoggingTool.cpython-39.pyc
diff --git a/detection/tool/__pycache__/MariadbTool.cpython-39.pyc b/detection/tool/__pycache__/MariadbTool.cpython-39.pyc
Binary files differ
new file mode 100644
index 0000000..5cfa28f
--- /dev/null
+++ b/detection/tool/__pycache__/MariadbTool.cpython-39.pyc
diff --git a/detection/tool/__pycache__/__init__.cpython-39.pyc b/detection/tool/__pycache__/__init__.cpython-39.pyc
Binary files differ
new file mode 100644
index 0000000..84ff087
--- /dev/null
+++ b/detection/tool/__pycache__/__init__.cpython-39.pyc
diff --git a/detection/vpn_detector.py b/detection/vpn_detector.py
new file mode 100644
index 0000000..c627aea
--- /dev/null
+++ b/detection/vpn_detector.py
@@ -0,0 +1,205 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# @Time : 2024/1/8 13:51
+# @author : yinjiangyi
+# @File : vpn_detector.py
+# @Function: parent class of the various vpn classes in the vpnservices package
+import argparse
+import datetime
+import socket
+
+import sys
+import os
+sys.path.append('..')
+
+import pandas as pd
+from tool.LoggingTool import Logger
+from clickhouse_driver import Client
+from tool.Config import Config
+from tool.KnowledgeBaseTool import KnowledgeApi
+import concurrent.futures
+
+
+class VpnDetector:
+    """
+    Parent class of the various vpn classes in the vpnservices package.
+    Responsibilities: load config, query data from the clickhouse database,
+    save results, and write data to the knowledge base.
+    """
+    def __init__(self, start_time, end_time):
+        self.config = self.load_config()
+        self.output_file_path = self.config['common']['output_path']
+        self.logger = Logger().getLogger()
+        self.kb = KnowledgeApi(self.config)
+        self.start_time = start_time
+        self.end_time = end_time
+        self.time_zone = self.config['common']['time_zone']
+        self.dbname = self.config['clickhouse']['db_name']
+        self.table_name = self.config['clickhouse']['table_name']
+
+        self.client = Client(user=self.config['clickhouse']['username'],
+                             password=self.config['clickhouse']['password'],
+                             host=self.config['clickhouse']['host'],
+                             port=self.config['clickhouse']['port'], database=self.dbname)
+
+    def load_config(self):
+        """
+        Load config
+        :return: config
+        """
+        config = Config().config
+        return config
+
+    def save_to_knowledgebase(self, object_list, object_type, vpn_service_name, plugin_id, plugin_name, output_filename, confidence='suspected'):
+        """
+        Write data to a local file and to the knowledge base
+        :param object_type: ip or domain
+        :param object_list: list of ip or domain
+        :param vpn_service_name: nordvpn, cyberghostvpn, etc.
+        :param plugin_id: plugin id
+        :param plugin_name: plugin name
+        :param output_filename: filename
+        :param confidence: one of three confidence levels: confirmed, suspected, tentative
+        :return:
+        """
+        if object_type == 'ip':
+            library_name = self.config['knowledgebase']['ip_library_name']
+        else:
+            library_name = self.config['knowledgebase']['domain_library_name']
+        knowledge_id = self.kb.get_library_id(library_name)
+
+        # convert result data into the required format https://docs.geedge.net/pages/viewpage.action?pageId=104760257
+        result_df = pd.DataFrame()
+        if object_type == 'ip':
+            result_df['ip1'] = object_list
+            result_df['ip2'] = object_list
+            result_df.insert(0, 'addr_format', 'Single')
+        if object_type == 'domain':
+            result_df['domain'] = object_list
+
+        result_df['plugin_id'] = plugin_id
+        result_df['plugin_name'] = plugin_name
+        result_df['vpn_service_name'] = vpn_service_name
+        result_df['method'] = 'passive_ml'
+        result_df['confidence'] = confidence
+        result_df['is_valid'] = 1
+
+        # save any non-empty result set (the original `> 1` skipped single results)
+        if len(result_df) > 0:
+            self.logger.info('Start to update data to knowledgebase')
+            result_path = os.path.join('data', plugin_name)
+            if not os.path.exists(result_path):
+                os.makedirs(result_path)
+            result_file = os.path.join(result_path, output_filename)
+            result_df.to_csv(result_file, index=False)
+
+            # upload to the knowledge base
+            knowledge_api = KnowledgeApi(self.config)
+            self.logger.info('[Updating knowledgebase]- {} num:{}'.format(object_type, len(object_list)))
+            description_str = "Update {} record(s).".format(len(object_list))
+            knowledge_api.file_import(result_file, knowledge_id, 'update', description_str)
+
+    def get_resolved_addr(self, server_name):
+        """
+        Resolve a server name to its ip address list
+        :param server_name: server name
+        :return: (server name, ip addr list or None)
+        """
+        resolved_addr = None
+        try:
+            resolved_addr = socket.gethostbyname_ex(server_name)[2]
+        except Exception:
+            # resolution failures are expected for guessed names; ignore them
+            # self.logger.error("Resolve failed. {}: {}".format(server_name, e))
+            pass
+
+        return server_name, resolved_addr
+
+    def resolve_dns_for_domain_list(self, domain_list, max_workers=100):
+        """
+        Resolve a domain list to an ip list
+        :param domain_list:
+        :param max_workers:
+        :return:
+        """
+        results = []
+        with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
+            futures = [executor.submit(self.get_resolved_addr, domain) for domain in domain_list]
+            for future in concurrent.futures.as_completed(futures):
+                try:
+                    server_name, resolved_addr = future.result()
+                    if resolved_addr is not None:
+                        results.extend(resolved_addr)
+                except Exception as e:
+                    self.logger.error(e)
+
+        return results
+
+
+# entry point
+if __name__ == '__main__':
+
+    parser = argparse.ArgumentParser(description='VPN detection')
+    parser.add_argument('-p', '--plugin', type=str, help='plugin name')
+    parser.add_argument('-m', '--mode', type=str, default='recent', help='recent or fixed')
+    parser.add_argument('-r', '--recent_interval', type=str, default='1h', help='recent time')
+    parser.add_argument('-s', '--start', type=str, default='', help='start time')
+    parser.add_argument('-e', '--end', type=str, default='', help='end time')
+
+    args = parser.parse_args()
+    plugin_name = args.plugin
+    mode = args.mode
+    start_time = args.start
+    end_time = args.end
+
+    # parameter verification
+    if mode == 'recent':
+        if start_time != '' or end_time != '':
+            print('Please input correct time format')
+            exit()
+        recent_interval = args.recent_interval
+        if recent_interval[-1] == 'h':
+            recent_interval = int(recent_interval[:-1])
+        elif recent_interval[-1] == 'd':
+            recent_interval = int(recent_interval[:-1]) * 24
+        else:
+            print('Please input correct recent interval')
+            exit()
+        # round the current time down to the hour and go back recent_interval hours
+        end_time = datetime.datetime.now().strftime("%Y-%m-%d %H:00:00")
+        start_time = (datetime.datetime.now() - datetime.timedelta(hours=recent_interval)).strftime("%Y-%m-%d %H:00:00")
+    elif mode == 'fixed':
+        if start_time == '' or end_time == '':
+            print('Please input correct time format')
+            exit()
+    else:
+        print('Please input correct time mode')
+        exit()
+
+    detector = None
+    if plugin_name == 'hotspotvpn_serverip':
+        from vpnservices.hotspotvpn_serverip import HotspotvpnServerip
+        detector = HotspotvpnServerip(start_time, end_time)
+    elif plugin_name == 'ipvanishvpn_servername':
+        from vpnservices.ipvanishvpn_servername import IpvanishvpnServername
+        detector = IpvanishvpnServername(start_time, end_time)
+    elif plugin_name == 'ipvanishvpn_serverip':
+        from vpnservices.ipvanishvpn_serverip import IpvanishvpnServerip
+        detector = IpvanishvpnServerip(start_time, end_time)
+    else:
+        print('Please input correct plugin name')
+        exit()
+
+    result_list = detector.find_server()
+    if len(result_list) > 0:
+        detector.save_to_knowledgebase(result_list, detector.object_type,
+                                       detector.vpn_service_name, detector.plugin_id,
+                                       detector.plugin_name, detector.output_file_name, detector.confidence)
diff --git a/detection/vpnservices/__init__.py b/detection/vpnservices/__init__.py
new file mode 100644
index 0000000..7b4a106
--- /dev/null
+++ b/detection/vpnservices/__init__.py
@@ -0,0 +1,6 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# @Time : 2024/1/8 11:48
+# @author : yinjiangyi
+# @File : __init__.py
+# @Function:
diff --git a/detection/vpnservices/__pycache__/__init__.cpython-39.pyc b/detection/vpnservices/__pycache__/__init__.cpython-39.pyc
Binary files differ
new file mode 100644
index 0000000..2a33991
--- /dev/null
+++ b/detection/vpnservices/__pycache__/__init__.cpython-39.pyc
diff --git a/detection/vpnservices/__pycache__/hotspotvpn_serverip.cpython-39.pyc b/detection/vpnservices/__pycache__/hotspotvpn_serverip.cpython-39.pyc
Binary files differ
new file mode 100644
index 0000000..bc2413a
--- /dev/null
+++ b/detection/vpnservices/__pycache__/hotspotvpn_serverip.cpython-39.pyc
diff --git a/detection/vpnservices/__pycache__/ipvanishvpn_serverip.cpython-39.pyc b/detection/vpnservices/__pycache__/ipvanishvpn_serverip.cpython-39.pyc
Binary files differ
new file mode 100644
index 0000000..5e3a0e2
--- /dev/null
+++ b/detection/vpnservices/__pycache__/ipvanishvpn_serverip.cpython-39.pyc
diff --git a/detection/vpnservices/__pycache__/ipvanishvpn_servername.cpython-39.pyc b/detection/vpnservices/__pycache__/ipvanishvpn_servername.cpython-39.pyc
Binary files differ
new file mode 100644
index 0000000..b6603a4
--- /dev/null
+++ b/detection/vpnservices/__pycache__/ipvanishvpn_servername.cpython-39.pyc
diff --git a/detection/vpnservices/hotspotvpn_serverip.py b/detection/vpnservices/hotspotvpn_serverip.py
new file mode 100644
index 0000000..39aa875
--- /dev/null
+++ b/detection/vpnservices/hotspotvpn_serverip.py
@@ -0,0 +1,63 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# @Time : 2024/1/11 15:45
+# @author : yinjiangyi
+# @File : hotspotvpn_serverip.py
+# @Function: detect hotspotvpn server ips
+
+from vpn_detector import VpnDetector
+import pandas as pd
+
+
+class HotspotvpnServerip(VpnDetector):
+    """
+    This class is used to detect hotspotvpn server ips
+    """
+
+    def __init__(self, start_time, end_time):
+        super().__init__(start_time, end_time)
+        self.plugin_config = self.load_config()['hotspotvpn_serverip']
+        self.plugin_id = self.plugin_config['plugin_id']
+        self.plugin_name = self.plugin_config['plugin_name']
+        self.object_type = self.plugin_config['object_type']
+        self.vpn_service_name = self.plugin_config['vpn_service_name']
+        self.confidence = self.plugin_config['confidence']
+        self.output_file_name = self.plugin_name + '_' + str(self.start_time).replace(' ', '_').replace(':', '')[:13] + '.csv'
+        self.start_time = start_time
+        self.end_time = end_time
+
+        self.sql = self.plugin_config['sql']
+        self.masquerade_domains = ["'" + i.strip() + "'" for i in self.plugin_config['domains'].split(',')]
+
+    def find_server(self):
+        """
+        Get hotspotvpn server ips from the clickhouse database
+        :return: hotspotvpn server ip list
+        """
+        self.logger.info('Start to query hotspotvpn server ip from clickhouse database')
+
+        # construct the query sql from the configured template
+        TIME_FILTER_PATTERN = "(common_recv_time > toDateTime('{$start_time}', '{$time_zone}')) AND(common_recv_time <= toDateTime('{$end_time}', '{$time_zone}'))"
+        time_filter = TIME_FILTER_PATTERN.replace("{$start_time}", str(self.start_time)).replace("{$end_time}", str(
+            self.end_time)).replace("{$time_zone}", self.time_zone)
+        self.sql = self.sql.replace("{$db_name}", self.dbname).replace("{$table_name}", self.table_name)
+        self.sql = self.sql.replace("{$time_filter}", time_filter)
+        self.sql = self.sql.replace("{$domain_list}", ','.join(self.masquerade_domains))
+
+        # self.logger.info("Sql for {}: {}".format(self.plugin_name, self.sql))
+
+        # query data from the clickhouse database
+        try:
+            hotspotvpn_serverip_df = pd.DataFrame(self.client.execute(self.sql))
+        finally:
+            self.client.disconnect()
+
+        if hotspotvpn_serverip_df.empty:
+            self.logger.info('No hotspotvpn server ip found from clickhouse database')
+            return []
+        hotspotvpn_serverip_list = hotspotvpn_serverip_df[0].drop_duplicates().tolist()
+        self.logger.info('Query hotspotvpn server ip from clickhouse database successfully. {} items found'
+                         .format(len(hotspotvpn_serverip_list)))
+
+        return hotspotvpn_serverip_list
diff --git a/detection/vpnservices/ipvanishvpn_serverip.py b/detection/vpnservices/ipvanishvpn_serverip.py
new file mode 100644
index 0000000..4d6daaa
--- /dev/null
+++ b/detection/vpnservices/ipvanishvpn_serverip.py
@@ -0,0 +1,91 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# @Time : 2024/1/12 10:00
+# @author : yinjiangyi
+# @File : ipvanishvpn_serverip.py
+# @Function: detect ipvanishvpn server ips
+
+import socket
+from vpn_detector import VpnDetector
+from tool.Functions import check_internet
+from tool.MariadbTool import MariadbUtil
+
+
+class IpvanishvpnServerip(VpnDetector):
+    """
+    This class is used to detect ipvanishvpn server ips
+    """
+
+    def __init__(self, start_time, end_time):
+        super().__init__(start_time, end_time)
+        self.plugin_config = self.load_config()['ipvanishvpn_serverip']
+        self.plugin_id = self.plugin_config['plugin_id']
+        self.plugin_name = self.plugin_config['plugin_name']
+        self.object_type = self.plugin_config['object_type']
+        self.vpn_service_name = self.plugin_config['vpn_service_name']
+        self.confidence = self.plugin_config['confidence']
+        self.output_file_name = self.plugin_name + '_' + str(self.start_time).replace(' ', '_').replace(':', '')[:13] + '.csv'
+        self.start_time = start_time
+        self.end_time = end_time
+
+        self.kb_sql = self.plugin_config['kb_sql']
+        self.kb_dbname = self.config['knowledgebase']['db_name']
+        self.kb_table_name = self.config['knowledgebase']['domain_library_name']
+
+        self.mariadb = MariadbUtil(self.config['mariadb']['host'], self.config['mariadb']['port'],
+                                   self.config['mariadb']['user'], str(self.config['mariadb']['pswd']),
+                                   self.config['mariadb']['db_name'])
+        self.mariadb_dbname = self.config['mariadb']['db_name']
+        self.mariadb_ip_tb_name = self.config['mariadb']['ip_table_name']
+        self.mariadb_domain_tb_name = self.config['mariadb']['domain_table_name']
+
+    def find_more_servernames(self, server_name_list):
+        """
+        Expand the observed ipvanish server name list: take each observed
+        5-character prefix and enumerate the numbered hosts 00-99 under it
+        :return: expanded server name list
+        """
+        prefix_list = []
+        expanded_server_names = []
+
+        for server_name in server_name_list:
+            domain = server_name.strip()
+            domain_prefix = domain[:5]
+            prefix_list.append(domain_prefix)
+
+        prefix_list = set(prefix_list)
+
+        for domain_prefix in prefix_list:
+            domain_list = [f"{domain_prefix}{str(index).zfill(2)}.vpn.ipvanish.com" for index in range(100)]
+            expanded_server_names.extend(domain_list)
+
+        return expanded_server_names
+
+    def find_server(self):
+        """
+        Get ipvanishvpn server ips by resolving known server names
+        :return: ipvanishvpn server ip list
+        """
+        self.kb_sql = self.kb_sql.replace("{$mariadb_dbname}", self.mariadb_dbname).replace("{$mariadb_domain_tablename}", self.mariadb_domain_tb_name)
+
+        ipvanish_servername_list = []
+        resolved_ip_list = []
+        try:
+            query_result = self.mariadb.query_sql(self.kb_sql)
+        finally:
+            self.mariadb.close()
+
+        if query_result:
+            ipvanish_servername_list = [i[0] for i in query_result]
+
+        # if the internet is reachable, expand the observed server names and
+        # resolve them externally to collect their ip addresses
+        if check_internet():
+            ipvanish_servername_list = self.find_more_servernames(ipvanish_servername_list)
+            resolved_ip_list = self.resolve_dns_for_domain_list(ipvanish_servername_list)
+
+        return resolved_ip_list
diff --git a/detection/vpnservices/ipvanishvpn_servername.py b/detection/vpnservices/ipvanishvpn_servername.py
new file mode 100644
index 0000000..3d82074
--- /dev/null
+++ b/detection/vpnservices/ipvanishvpn_servername.py
@@ -0,0 +1,62 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# @Time : 2024/1/11 15:45
+# @author : yinjiangyi
+# @File : ipvanishvpn_servername.py
+# @Function: detect ipvanish server names
+
+import sys
+sys.path.append('..')
+from vpn_detector import VpnDetector
+import pandas as pd
+
+
+class IpvanishvpnServername(VpnDetector):
+    """
+    This class is used to detect ipvanish server names
+    """
+
+    def __init__(self, start_time, end_time):
+        super().__init__(start_time, end_time)
+        self.plugin_config = self.load_config()['ipvanishvpn_servername']
+        self.plugin_id = self.plugin_config['plugin_id']
+        self.plugin_name = self.plugin_config['plugin_name']
+        self.object_type = self.plugin_config['object_type']
+        self.vpn_service_name = self.plugin_config['vpn_service_name']
+        self.confidence = self.plugin_config['confidence']
+        self.output_file_name = self.plugin_name + '_' + str(self.start_time).replace(' ', '_').replace(':', '')[:13] + '.csv'
+        self.start_time = start_time
+        self.end_time = end_time
+
+        self.sql = self.plugin_config['sql']
+
+    def find_server(self):
+        """
+        Get ipvanishvpn server names from the clickhouse database
+        :return: ipvanishvpn server name list
+        """
+        self.logger.info('Start to query ipvanishvpn server name from session record')
+
+        # construct the query sql from the configured template
+        TIME_FILTER_PATTERN = "(common_recv_time > toDateTime('{$start_time}', '{$time_zone}')) AND(common_recv_time <= toDateTime('{$end_time}', '{$time_zone}'))"
+        time_filter = TIME_FILTER_PATTERN.replace("{$start_time}", str(self.start_time)).replace("{$end_time}", str(
+            self.end_time)).replace("{$time_zone}", self.time_zone)
+        self.sql = self.sql.replace("{$db_name}", self.dbname).replace("{$table_name}", self.table_name)
+        self.sql = self.sql.replace("{$time_filter}", time_filter)
+        self.logger.info("Sql for {}: {}".format(self.plugin_name, self.sql))
+
+        # query data from the clickhouse database
+        try:
+            ipvanishvpn_servername_df = pd.DataFrame(self.client.execute(self.sql))
+        finally:
+            self.client.disconnect()
+
+        if ipvanishvpn_servername_df.empty:
+            self.logger.info('No ipvanishvpn server name found from clickhouse database')
+            return []
+        ipvanishvpn_servername_list = ipvanishvpn_servername_df[0].drop_duplicates().tolist()
+        self.logger.info('Query ipvanishvpn server name from clickhouse database successfully. {} items found'
+                         .format(len(ipvanishvpn_servername_list)))
+
+        return ipvanishvpn_servername_list
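Example invocations of the vpn_detector.py entry point above (illustrative only, assuming config.yaml and the plugins in this commit are in place):

    python vpn_detector.py -p hotspotvpn_serverip -m recent -r 1h
    python vpn_detector.py -p ipvanishvpn_servername -m fixed -s '2024-01-10 00:00:00' -e '2024-01-11 00:00:00'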
