| author | HEATAO <[email protected]> | 2021-12-04 19:05:33 +0800 |
|---|---|---|
| committer | HEATAO <[email protected]> | 2021-12-04 19:05:33 +0800 |
| commit | a27ee32d641301e879d0f268e912b899a7e6249a (patch) | |
| tree | 906bdb4d7954917ea3584f4d760a82c103650881 | |
| parent | a5f1c8b920f7f2e7c1470c4b26e88b41a4a8e0db (diff) | |
add max pattern length && fix bug
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | src/main/java/cn/ac/iie/Utils/JedisPoolUtils.java | 5 |
| -rw-r--r-- | src/main/java/cn/ac/iie/Utils/SparkFactoryUtils.java | 2 |
| -rw-r--r-- | src/main/java/cn/ac/iie/Utils/URLHandleUtils.java | 50 |
| -rw-r--r-- | src/main/java/cn/ac/iie/config/Configurations.java | 7 |
| -rw-r--r-- | src/main/java/cn/ac/iie/intervalStatics/FeatureGenerator.java | 92 |
| -rw-r--r-- | src/main/java/cn/ac/iie/mining/MiningService.java | 5 |
| -rw-r--r-- | src/main/java/cn/ac/iie/mining/prefixSpark.java | 2 |
| -rw-r--r-- | src/main/resources/video_data.properties | 2 |
| -rw-r--r-- | target/classes/META-INF/VideoPortalDetection.kotlin_module | bin 16 -> 0 bytes |
| -rw-r--r-- | target/classes/video_data.properties | 2 |
10 files changed, 122 insertions, 45 deletions
```diff
diff --git a/src/main/java/cn/ac/iie/Utils/JedisPoolUtils.java b/src/main/java/cn/ac/iie/Utils/JedisPoolUtils.java
index 46cb0d9..c748d00 100644
--- a/src/main/java/cn/ac/iie/Utils/JedisPoolUtils.java
+++ b/src/main/java/cn/ac/iie/Utils/JedisPoolUtils.java
@@ -71,10 +71,9 @@ public class JedisPoolUtils {
     public synchronized static Jedis getJedis() {
         try {
             if (jedisPool == null) {
-                JedisPool jedisPool = JedisPoolUtils.getJedisPoolInstance();
+                jedisPool = JedisPoolUtils.getJedisPoolInstance();
             }
-            Jedis resource = jedisPool.getResource();
-            return resource;
+            return jedisPool.getResource();
         } catch (Exception e) {
             e.printStackTrace();
             return null;
diff --git a/src/main/java/cn/ac/iie/Utils/SparkFactoryUtils.java b/src/main/java/cn/ac/iie/Utils/SparkFactoryUtils.java
index 067fe4a..f081550 100644
--- a/src/main/java/cn/ac/iie/Utils/SparkFactoryUtils.java
+++ b/src/main/java/cn/ac/iie/Utils/SparkFactoryUtils.java
@@ -18,7 +18,7 @@ public class SparkFactoryUtils {
             ;
             spark = SparkSession
                     .builder()
-                    //.master("local[*]")
+                    // .master("local[*]")
                     .config(conf)
                     .getOrCreate();
             SparkContext sc = spark.sparkContext();
diff --git a/src/main/java/cn/ac/iie/Utils/URLHandleUtils.java b/src/main/java/cn/ac/iie/Utils/URLHandleUtils.java
index 5cc00ea..b5b2588 100644
--- a/src/main/java/cn/ac/iie/Utils/URLHandleUtils.java
+++ b/src/main/java/cn/ac/iie/Utils/URLHandleUtils.java
@@ -13,11 +13,55 @@ public class URLHandleUtils {
         else return yspHost;
     }
 
+    /**
+     * Note: hosts sometimes arrive in forms like 182.254.52.218/amobile.music.tc.qq.com and need special handling
+     * @param url the full url
+     * @return the host to use for data mining
+     */
     public static String getFirstURLPath(String url) {
         String[] yspHostList = url.split("/");
-        if (yspHostList.length >= 1)
-            return yspHostList[0];
-        else return url;
+        if (yspHostList.length >= 2) {
+            if (isCorrectIp(yspHostList[0]) && isDomainCrude(yspHostList[1]))
+                return yspHostList[1];
+            else {
+                return yspHostList[0];
+            }
+        }
+        return url;
+    }
+
+    public static boolean isCorrectIp(String ipString) {
+        String ipRegex = "\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}"; // regex for an IPv4 address
+        // if the pattern matches, verify that every octet lies within 0-255
+        if (ipString.matches(ipRegex)) {
+            String[] ipArray = ipString.split("\\.");
+            for (int i = 0; i < ipArray.length; i++) {
+                int number = Integer.parseInt(ipArray[i]);
+                // each octet must be between 0 and 255
+                if (number < 0 || number > 255) {
+                    return false;
+                }
+            }
+            return true;
+        }
+        else {
+            return false; // no match against the regex, so not an IP
+        }
+    }
+
+    private static boolean isDomainCrude(String isString) {
+        String[] strSpl = isString.split("\\.");
+        if (strSpl.length == 1)
+            return false;
+        char[] chars = strSpl[strSpl.length - 1].toCharArray();
+        boolean isPhontic;
+        for (char aChar : chars) {
+            isPhontic = (aChar >= 'a' && aChar <= 'z') || (aChar >= 'A' && aChar <= 'Z');
+            if (!isPhontic) {
+                return false;
+            }
+        }
+        return true;
     }
 
     public static String handleSNI(String thisHost) {
```
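A quick sanity check of the new `getFirstURLPath` behavior. The driver below is illustrative only (the class and sample inputs are not in the repo); it exercises the ip/domain special case named in the Javadoc:

```java
import cn.ac.iie.Utils.URLHandleUtils;

// Hypothetical driver for the URLHandleUtils change above.
public class URLHandleDemo {
    public static void main(String[] args) {
        // IP followed by a domain: isCorrectIp matches the first segment and
        // isDomainCrude the second, so the domain wins.
        System.out.println(URLHandleUtils.getFirstURLPath("182.254.52.218/amobile.music.tc.qq.com"));
        // -> amobile.music.tc.qq.com

        // Ordinary host with a path: the host segment wins.
        System.out.println(URLHandleUtils.getFirstURLPath("v.qq.com/x/cover/abc.html"));
        // -> v.qq.com

        // No "/" at all: the input is returned unchanged.
        System.out.println(URLHandleUtils.getFirstURLPath("v.qq.com"));
        // -> v.qq.com
    }
}
```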
```diff
diff --git a/src/main/java/cn/ac/iie/config/Configurations.java b/src/main/java/cn/ac/iie/config/Configurations.java
index 4294233..4c03c01 100644
--- a/src/main/java/cn/ac/iie/config/Configurations.java
+++ b/src/main/java/cn/ac/iie/config/Configurations.java
@@ -1,17 +1,20 @@
 package cn.ac.iie.config;
 
-import java.io.FileNotFoundException;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
 import java.io.FileOutputStream;
 import java.io.IOException;
 import java.util.Properties;
 
 public class Configurations {
     private static Properties propCommon = new Properties();
+    private static Logger logger = LoggerFactory.getLogger(Configurations.class);
 
     static {
         try {
             propCommon.load(Configurations.class.getClassLoader().getResourceAsStream("video_data.properties"));
-            System.out.println("配置文件video_data.properties加载成功");
+            logger.warn("配置文件video_data.properties加载成功");
         } catch (Exception e) {
             propCommon = null;
             System.err.println("配置加载失败");
diff --git a/src/main/java/cn/ac/iie/intervalStatics/FeatureGenerator.java b/src/main/java/cn/ac/iie/intervalStatics/FeatureGenerator.java
index 5eeabb1..e263e97 100644
--- a/src/main/java/cn/ac/iie/intervalStatics/FeatureGenerator.java
+++ b/src/main/java/cn/ac/iie/intervalStatics/FeatureGenerator.java
@@ -23,6 +23,7 @@ import scala.collection.mutable.WrappedArray;
 import java.io.*;
 import java.nio.charset.StandardCharsets;
 import java.util.*;
+import static org.apache.spark.sql.functions.*;
 
 public class FeatureGenerator {
     static {
@@ -39,12 +40,12 @@ public class FeatureGenerator {
      * Evaluation: Index avg and Prep pos=1
      * Output: the vectors of each ysp host are separated from one another; each line under a host has the format [host, true/false, feature vector]
      */
-    public void featureExtracFromFile() {
-        try (InputStream is = this.getClass().getClassLoader().getResourceAsStream("testSet.txt");
-             Writer writer = new OutputStreamWriter(new FileOutputStream("feature.txt"), "UTF-8");
+    public void featureExtracFromFile(String fileName) {
+        try (InputStream is = this.getClass().getClassLoader().getResourceAsStream(fileName);
+             Writer writer = new OutputStreamWriter(new FileOutputStream("feature.txt"), StandardCharsets.UTF_8);
              SqlSession sqlSession = SqlSessionFactoryUtils.getSqlSessionFactory().openSession()) {
             if (is == null) {
-                System.out.println("文件不存在");
+                logger.error(String.format("%s 不存在", fileName));
                 System.exit(0);
             }
             AVDataListMapper avListMapper = sqlSession.getMapper(AVDataListMapper.class);
@@ -60,7 +61,7 @@ public class FeatureGenerator {
 
                 // compute each candidate set first, then determine where the current host sits
                 // key the feature results in redis, appending a FeatExtrac suffix
-                if (jedis != null && jedis.exists(yspHost) && !yspHost.startsWith("200.200.200.200")) {
+                if (jedis != null && !yspHost.startsWith("200.200.200.200")) {
                     // everything previously written to redis met the criteria, so the cache can decide here
                     // when run standalone, this step actually needs to be performed once
                     writer.write(yspHost + " " + refererHost + "\r\n");
@@ -70,17 +71,17 @@ public class FeatureGenerator {
                     recallAndResort(yspHost, refererHost, writer, jedis, avListMapper, 1);
                 }
                 else {
-                    logger.warn(String.format("%s在Cache中存在", yspHost));
+                    logger.warn(String.format("%s 在 Cache 中存在", yspHost));
                     String[] candSetArray = candSet.substring(1, candSet.length() - 1).split(", ");
                     for (String eachHost: candSetArray) {
                         String hostKey = eachHost + "_FeatExtrac";
                         String cacheLine = jedis.hget(yspHost, hostKey);
-                        if (cacheLine == null) {
-                            logger.error("candSet cache存在而某条feature cache不存在!");
+                        if (cacheLine == null && eachHost.length() > 0) {
+                            logger.error(String.format("candSet cache 存在而 %s 不存在!", hostKey));
                             continue;
                         }
                         // note the format of cacheLine
-                        writer.write(eachHost + "," + eachHost.equals(refererHost) + "," + cacheLine);
+                        writer.write(eachHost + " " + eachHost.equals(refererHost) + " " + cacheLine);
                     }
                 }
             }
@@ -99,16 +100,16 @@ public class FeatureGenerator {
      * Write each line into the training data. To keep the host count down, a line is written only if it appears in both candidate sets.
      * Return 0 if the referer did not appear in the two candidate sets, otherwise 1
      * @param writer
+     * @param allHisMap map of full history statistics; values are occurrence counts; note allHisMap can get very large
      * @param rule1Map map of 1-item sets; values are counts
      * @param rule2Map map for rule 2; values are how often the host appears later on (not at the head)
-     * @param allHisMap map of full history statistics; values are occurrence counts; note allHisMap can get very large
      * @param candSet the candidate set, of course~
      */
-    private int featureWrite(String yspHost, String referHost, Writer writer, Jedis jedis,
-                             Map<String, Integer> allHisMap, Map<String, Long> prefixCntMap,
-                             Map<String, Integer[]> rule1Map, Map<String, Integer[]> rule2Map,
-                             Map<String, Integer> occHash, Map<String, Integer> disToYspMap,
-                             Set<String> candSet, float candHostSum, float hisSum, float freqPatterns) throws IOException {
+    private void featureWrite(String yspHost, String referHost, Writer writer, Jedis jedis,
+                              Map<String, Integer> allHisMap, Map<String, Long> prefixCntMap,
+                              Map<String, Integer[]> rule1Map, Map<String, Integer[]> rule2Map,
+                              Map<String, Integer> occHash, Map<String, Integer> disToYspMap,
+                              Set<String> candSet, float candHostSum, float hisSum, float freqPatterns) throws IOException {
         jedis.hset(yspHost, "candSet", candSet.toString());
         for (String thisHost : candSet) {
             if ((rule1Map.containsKey(thisHost) || allHisMap.containsKey(thisHost)) && thisHost.length() > 0) {
```
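For orientation between the hunks above: `featureWrite` persists one Redis hash per ysp host (a `candSet` field plus one `<host>_FeatExtrac` field per candidate), and the cache-hit branch of `featureExtracFromFile` reads it back. A minimal sketch of that layout — only the field names come from the diff; the hosts and feature values are invented:

```java
import cn.ac.iie.Utils.JedisPoolUtils;
import redis.clients.jedis.Jedis;

// Illustrative only: mirrors the cache layout used by featureExtracFromFile
// and featureWrite above.
public class CacheLayoutDemo {
    public static void main(String[] args) {
        Jedis jedis = JedisPoolUtils.getJedis();
        String yspHost = "example-video-host.com";   // one hash per ysp host (made-up host)

        // what featureWrite persists
        jedis.hset(yspHost, "candSet", "[a.example.com, b.example.com]");
        jedis.hset(yspHost, "a.example.com_FeatExtrac", "0.3 0.1 0.7\r\n");

        // what the cache-hit branch reads back
        String candSet = jedis.hget(yspHost, "candSet");
        for (String eachHost : candSet.substring(1, candSet.length() - 1).split(", ")) {
            String cacheLine = jedis.hget(yspHost, eachHost + "_FeatExtrac");
            System.out.println(eachHost + " -> " + cacheLine);  // null entries are skipped above
        }
        JedisPoolUtils.returnResource(jedis);
    }
}
```

The FeatureGenerator.java diff continues below.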
```diff
@@ -138,9 +139,6 @@ public class FeatureGenerator {
             }
         }
 
-        if (rule1Map.containsKey(referHost) || allHisMap.containsKey(referHost))
-            return 1;
-        else return 0;
     }
 
     /**
@@ -158,7 +156,7 @@ public class FeatureGenerator {
                         (int)(allHisMap.getOrDefault(o1, 0) + prefixCntMap.getOrDefault(o1, 0L)
                                 + rule1Map.getOrDefault(o1, new Integer[]{0, 0})[0]
                                 - rule2Map.getOrDefault(o1, new Integer[]{0, 0})[0])));
-        jedis.hset(yspHost, "candSortedSet", candList.toString());
+        jedis.hset(yspHost, "candiSortedSet", candList.toString());
         for (String thisHost : candList) {
             writer.write(thisHost + "\r\n");
         }
@@ -211,7 +209,11 @@ public class FeatureGenerator {
         String likeUrl = '%' + yspHost + '%';
         List<AVlog> aVlogs = avListMapper.getHistoryHost(likeUrl, Configurations.getIntProperty(0, "rawLimit"));
         float candHostSum = aVlogs.size();
-        logger.warn(String.format("%s过去曾出现过%f次", yspHost, candHostSum));
+        if (candHostSum < Configurations.getIntProperty(0, "leastHistoryNum")) {
+            logger.warn(String.format("%s 过去曾出现过 %f 次,不符合 Mining 条件", yspHost, candHostSum));
+            return;
+        }
+        else logger.warn(String.format("%s 过去曾出现过 %f 次", yspHost, candHostSum));
 
         // the hash maps below actually share the same keys - should they be merged?
         // hash map of total history counts, used for recall
@@ -280,7 +282,7 @@ public class FeatureGenerator {
 
         int minMiNum = Configurations.getIntProperty(0, "leastHistoryNum");
         if (historyCnt - overflowCols < minMiNum || data.size() < minMiNum) {
-            logger.warn("不满足mining条件");
+            logger.warn("不满足 Mining 条件");
             return;
         }
@@ -296,12 +298,13 @@ public class FeatureGenerator {
                 false, Metadata.empty())
         });
         Dataset<Row> sequenceDF = SparkFactoryUtils.getSparkSessionFactory().createDataFrame(data, schema);
-        sequenceDF.show(20, true);
-        //sequenceDF.show(50, false);
-        PrefixSpan prefixSpan = new PrefixSpan().setMinSupport(Configurations.getDoubleProperty(0, "prefixConfidence")).setMaxPatternLength(5);
+        // sequenceDF.show(20, true);
+        PrefixSpan prefixSpan = new PrefixSpan()
+                .setMinSupport(Configurations.getDoubleProperty(0, "prefixConfidence"))
+                .setMaxPatternLength(Configurations.getIntProperty(0, "prefixMaxPatternLength"));
         // Finding frequent sequential patterns
         Dataset<Row> sequence = prefixSpan.findFrequentSequentialPatterns(sequenceDF);
-        sequence.show(10, false);
+        // sequence.show(10, false);
 
         // rebuild the candidates here
         HashMap<String, Integer[]> restFirstMap = new HashMap<>();
```
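Both mining call sites now read their PrefixSpan parameters from video_data.properties instead of hard-coding `maxPatternLength=5`. A sketch of the wiring and what the two knobs mean — the class name is illustrative, while the property calls mirror the diff:

```java
import cn.ac.iie.config.Configurations;
import org.apache.spark.ml.fpm.PrefixSpan;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;

// Sketch of the now-configurable miner; sequenceDF is assumed to carry the
// "sequence" column (array<array<string>>) built earlier in the method.
public class PrefixSpanWiring {
    static Dataset<Row> mine(Dataset<Row> sequenceDF) {
        PrefixSpan prefixSpan = new PrefixSpan()
                // minSupport = 0.3: a pattern must occur in at least
                // 30% of the input sequences to be reported
                .setMinSupport(Configurations.getDoubleProperty(0, "prefixConfidence"))
                // maxPatternLength = 2: patterns longer than two items are pruned,
                // which also caps PrefixSpan's search space
                .setMaxPatternLength(Configurations.getIntProperty(0, "prefixMaxPatternLength"));
        // Output schema: sequence array<array<string>>, freq long —
        // one row per frequent sequential pattern with its support count.
        return prefixSpan.findFrequentSequentialPatterns(sequenceDF);
    }
}
```

The FeatureGenerator.java diff continues below.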
```diff
@@ -310,11 +313,29 @@ public class FeatureGenerator {
         String candidateItem;
         HashMap<String, Long> prefixCntMap = new HashMap<>();
         float freqPatterns = 0;
-        for(Iterator<Row> iter = sequence.toLocalIterator(); iter.hasNext();) {
+        // Iterator<Row> iter = sequence.toLocalIterator();
+        Dataset<Row> sequenceSorted = sequence.sort(col("freq").desc());
+
+        // TODO: should process only the first n rows via take, to avoid memory overflow
+        // List<Row> out = sequenceSorted.takeAsList(100);
+        // for (Row row: out) {
+        //     System.out.println(row.get(0));
+        //     System.out.println(row.getStruct(0));
+        //     System.out.println(row.apply(0));
+        //     System.out.println(row.apply(1)); // freq
+        //     System.out.println(row.get(1));
+        //     System.out.println(row.toString());
+        // }
+        // System.exit(1);
+
+        Iterator<Row> iter = sequenceSorted.toLocalIterator();
+        logger.warn("序列挖掘成功!");
+        while(iter.hasNext()) {
             Row each = iter.next();
             if (each.size() == 2) {
                 freqPatterns += 1;
                 try {
+                    // the raw-type warning below is harmless; fill in the generics if it bothers you
                     WrappedArray rawSequence = (WrappedArray)each.get(0);
                     WrappedArray firstItem = (WrappedArray)rawSequence.apply(0);
                     if (rawSequence.size() == 1) {
@@ -392,13 +413,16 @@ public class FeatureGenerator {
         return candSet;
     }
 
-    public void refreshCache() {
+    /**
+     * @param fields candSet candiSortedSet
+     */
+    public void refreshCache(String... fields) {
         logger.warn("清除Cache...");
         Jedis jedis = JedisPoolUtils.getJedis();
         assert jedis != null;
         Set<String> cacheKeys = jedis.keys("*");
         for (String key: cacheKeys) {
-            jedis.hdel(key, "candSet", "candSortedSet");
+            jedis.hdel(key, fields);
         }
         JedisPoolUtils.returnResource(jedis);
     }
@@ -416,15 +440,16 @@ public class FeatureGenerator {
             List<AVlog> yspSet = avListMapper.getUniqAVListForAll();
 
             // take the processed urls and run sequence mining on them
+            int cnt = 1;
             for(AVlog yspLog: yspSet) {
                 String yspHost = yspLog.getUrl();
                 if (yspHost.length() == 0)
                     continue;
                 writer.write("-----------------------------------------------------------------------------------\r\n");
                 writer.write("*" + " " + yspHost + "\r\n");
-                System.out.println(yspHost);
+                logger.warn(String.format("当前处理 %s,已处理 %d 条", yspHost, cnt));
                 assert jedis != null;
-                String candSet = jedis.hget(yspHost, "candSortedSet");
+                String candSet = jedis.hget(yspHost, "candiSortedSet");
                 if (candSet == null) {
                     recallAndResort(yspHost, "", writer, jedis, avListMapper, 2);
                 }
@@ -435,6 +460,7 @@ public class FeatureGenerator {
                 }
             }
                 writer.write("-----------------------------------------------------------------------------------\r\n");
+                cnt += 1;
             }
         } catch (IOException e) {
             e.printStackTrace();
@@ -445,9 +471,9 @@ public class FeatureGenerator {
 
     public static void main(String[] args) {
         FeatureGenerator featureGenerator = new FeatureGenerator();
         logger.warn("特征生成任务开始...");
-        featureGenerator.refreshCache();
-        featureGenerator.GetYspCandNoRef();
-//        featureGenerator.featureExtracFromFile();
+        // featureGenerator.refreshCache("candSet");
+        // featureGenerator.GetYspCandNoRef();
+        featureGenerator.featureExtracFromFile("testSetALL.txt");
     }
 }
\ No newline at end of file
```
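`refreshCache` now takes the hash fields to delete as varargs rather than hard-coding them. Illustrative calls (the field names are the two used elsewhere in this commit):

```java
import cn.ac.iie.intervalStatics.FeatureGenerator;

// Hypothetical driver for the varargs refreshCache above.
public class RefreshCacheDemo {
    public static void main(String[] args) {
        FeatureGenerator featureGenerator = new FeatureGenerator();
        // drop only the candidate sets, keeping the sorted ones
        featureGenerator.refreshCache("candSet");
        // the previous hard-coded behavior (note the renamed second field)
        featureGenerator.refreshCache("candSet", "candiSortedSet");
    }
}
```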
```diff
diff --git a/src/main/java/cn/ac/iie/mining/MiningService.java b/src/main/java/cn/ac/iie/mining/MiningService.java
index 496ea65..65ab6c6 100644
--- a/src/main/java/cn/ac/iie/mining/MiningService.java
+++ b/src/main/java/cn/ac/iie/mining/MiningService.java
@@ -288,7 +288,9 @@ public class MiningService {
         Dataset<Row> sequenceDF = SparkFactoryUtils.getSparkSessionFactory().createDataFrame(data, schema);
         sequenceDF.show(20, false);
         //sequenceDF.show(50, false);
-        PrefixSpan prefixSpan = new PrefixSpan().setMinSupport(Configurations.getDoubleProperty(0, "prefixConfidence")).setMaxPatternLength(5);
+        PrefixSpan prefixSpan = new PrefixSpan()
+                .setMinSupport(Configurations.getDoubleProperty(0, "prefixConfidence"))
+                .setMaxPatternLength(Configurations.getIntProperty(0, "prefixMaxPatternLength"));
         // Finding frequent sequential patterns
         Dataset<Row> sequence = prefixSpan.findFrequentSequentialPatterns(sequenceDF);
 
@@ -363,7 +365,6 @@ public class MiningService {
      * Using 1-item sets directly is no different from plain counting; the portal actually tends to appear at the head of multi-item sets rather than as a 1-item set
      * Rule 1: the portal tends to appear at the head of multi-item sets -> count how often a candidate appears at the head of a multi-item set
      * Rule 2: the portal is less likely to appear in unordered sets -> count the proportion of non-head appearances of a candidate (not necessarily a good idea, since many big sites may well appear later~)
-     * TODO: the sorting here needs improvement; ranking by sorted sequences ignores the numeric relations and loses part of the features; consider a learning-to-rank approach later
      *
      * @param candidateList the 1-item sets
      * @param restMap the multi-item sets
diff --git a/src/main/java/cn/ac/iie/mining/prefixSpark.java b/src/main/java/cn/ac/iie/mining/prefixSpark.java
index 330b6fd..0019559 100644
--- a/src/main/java/cn/ac/iie/mining/prefixSpark.java
+++ b/src/main/java/cn/ac/iie/mining/prefixSpark.java
@@ -53,7 +53,7 @@ public class prefixSpark {
 
         // Finding frequent sequential patterns
         Dataset<Row> sequence = prefixSpan.findFrequentSequentialPatterns(sequenceDF);
-        sequence.show();
+        sequence.show(100, false);
         // $example off$
         //List<String> listOne = sequence.as(Encoders.STRING()).collectAsList();
         //System.out.println(listOne);
diff --git a/src/main/resources/video_data.properties b/src/main/resources/video_data.properties
index 93182af..66aabd9 100644
--- a/src/main/resources/video_data.properties
+++ b/src/main/resources/video_data.properties
@@ -12,6 +12,8 @@ intervalRight = 15
 Spark.master=spark://192.168.10.9:7077
 # confidence for the prefix algorithm
 prefixConfidence=0.3
+# MaxPatternLength
+prefixMaxPatternLength=2
 # history window for data mining: current time - max interval = earliest history log used; note the unit is days
 historyTrafficInterval=180
 # maximum capacity of the candidate set
diff --git a/target/classes/META-INF/VideoPortalDetection.kotlin_module b/target/classes/META-INF/VideoPortalDetection.kotlin_module
deleted file mode 100644
index a49347a..0000000
--- a/target/classes/META-INF/VideoPortalDetection.kotlin_module
+++ /dev/null
Binary files differ
diff --git a/target/classes/video_data.properties b/target/classes/video_data.properties
index 93182af..66aabd9 100644
--- a/target/classes/video_data.properties
+++ b/target/classes/video_data.properties
@@ -12,6 +12,8 @@ intervalRight = 15
 Spark.master=spark://192.168.10.9:7077
 # confidence for the prefix algorithm
 prefixConfidence=0.3
+# MaxPatternLength
+prefixMaxPatternLength=2
 # history window for data mining: current time - max interval = earliest history log used; note the unit is days
 historyTrafficInterval=180
 # maximum capacity of the candidate set
```
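To see what the new `prefixMaxPatternLength=2` does in isolation, here is a minimal end-to-end run in the spirit of the prefixSpark.java scratch driver. Everything in it — local master, toy data, support threshold — is illustrative:

```java
import org.apache.spark.ml.fpm.PrefixSpan;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.*;

import java.util.Arrays;
import java.util.List;

public class MaxPatternLengthDemo {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder().master("local[*]")
                .appName("MaxPatternLengthDemo").getOrCreate();

        // two toy sessions: a -> b -> c and a -> b
        List<Row> data = Arrays.asList(
                RowFactory.create(Arrays.asList(Arrays.asList("a"), Arrays.asList("b"), Arrays.asList("c"))),
                RowFactory.create(Arrays.asList(Arrays.asList("a"), Arrays.asList("b"))));
        StructType schema = new StructType(new StructField[]{
                new StructField("sequence",
                        new ArrayType(new ArrayType(DataTypes.StringType, true), true),
                        false, Metadata.empty())});
        Dataset<Row> sequenceDF = spark.createDataFrame(data, schema);

        // with maxPatternLength=2, frequent subsequences of length <= 2 such as
        // <a,b> or <b,c> can surface, but the full <a,b,c> chain is pruned
        new PrefixSpan().setMinSupport(0.5).setMaxPatternLength(2)
                .findFrequentSequentialPatterns(sequenceDF)
                .show(100, false);

        spark.stop();
    }
}
```

Capping the pattern length both bounds PrefixSpan's search space and keeps the mined statistics focused on pairwise orderings, which is what the rule-1/rule-2 counting above consumes.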
