author    HEATAO <[email protected]>  2021-12-04 19:05:33 +0800
committer HEATAO <[email protected]>  2021-12-04 19:05:33 +0800
commit    a27ee32d641301e879d0f268e912b899a7e6249a (patch)
tree      906bdb4d7954917ea3584f4d760a82c103650881
parent    a5f1c8b920f7f2e7c1470c4b26e88b41a4a8e0db (diff)
add max pattern length && fix bug
-rw-r--r--  src/main/java/cn/ac/iie/Utils/JedisPoolUtils.java               5
-rw-r--r--  src/main/java/cn/ac/iie/Utils/SparkFactoryUtils.java            2
-rw-r--r--  src/main/java/cn/ac/iie/Utils/URLHandleUtils.java              50
-rw-r--r--  src/main/java/cn/ac/iie/config/Configurations.java              7
-rw-r--r--  src/main/java/cn/ac/iie/intervalStatics/FeatureGenerator.java  92
-rw-r--r--  src/main/java/cn/ac/iie/mining/MiningService.java               5
-rw-r--r--  src/main/java/cn/ac/iie/mining/prefixSpark.java                 2
-rw-r--r--  src/main/resources/video_data.properties                        2
-rw-r--r--  target/classes/META-INF/VideoPortalDetection.kotlin_module      bin 16 -> 0 bytes
-rw-r--r--  target/classes/video_data.properties                            2
10 files changed, 122 insertions, 45 deletions
diff --git a/src/main/java/cn/ac/iie/Utils/JedisPoolUtils.java b/src/main/java/cn/ac/iie/Utils/JedisPoolUtils.java
index 46cb0d9..c748d00 100644
--- a/src/main/java/cn/ac/iie/Utils/JedisPoolUtils.java
+++ b/src/main/java/cn/ac/iie/Utils/JedisPoolUtils.java
@@ -71,10 +71,9 @@ public class JedisPoolUtils {
public synchronized static Jedis getJedis() {
try {
if (jedisPool == null) {
- JedisPool jedisPool = JedisPoolUtils.getJedisPoolInstance();
+ jedisPool = JedisPoolUtils.getJedisPoolInstance();
}
- Jedis resource = jedisPool.getResource();
- return resource;
+ return jedisPool.getResource();
} catch (Exception e) {
e.printStackTrace();
return null;
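
The bug fixed in this hunk is variable shadowing: the old line declared a new local `JedisPool jedisPool`, so the static field stayed null and a fresh pool was built on every call. A minimal sketch of the corrected lazy-init pattern follows; the pool constructor arguments are stand-ins, not the project's real getJedisPoolInstance():

import redis.clients.jedis.Jedis;
import redis.clients.jedis.JedisPool;

public class JedisHolderSketch {
    private static JedisPool jedisPool;

    public static synchronized Jedis getJedis() {
        if (jedisPool == null) {
            // Assign the field itself; the original bug wrote
            // "JedisPool jedisPool = ..." here, shadowing the field so it
            // never got initialized.
            jedisPool = new JedisPool("localhost", 6379);
        }
        return jedisPool.getResource();
    }
}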
diff --git a/src/main/java/cn/ac/iie/Utils/SparkFactoryUtils.java b/src/main/java/cn/ac/iie/Utils/SparkFactoryUtils.java
index 067fe4a..f081550 100644
--- a/src/main/java/cn/ac/iie/Utils/SparkFactoryUtils.java
+++ b/src/main/java/cn/ac/iie/Utils/SparkFactoryUtils.java
@@ -18,7 +18,7 @@ public class SparkFactoryUtils {
;
spark = SparkSession
.builder()
- //.master("local[*]")
+ // .master("local[*]")
.config(conf)
.getOrCreate();
SparkContext sc = spark.sparkContext();
diff --git a/src/main/java/cn/ac/iie/Utils/URLHandleUtils.java b/src/main/java/cn/ac/iie/Utils/URLHandleUtils.java
index 5cc00ea..b5b2588 100644
--- a/src/main/java/cn/ac/iie/Utils/URLHandleUtils.java
+++ b/src/main/java/cn/ac/iie/Utils/URLHandleUtils.java
@@ -13,11 +13,55 @@ public class URLHandleUtils {
else return yspHost;
}
+ /**
+ * Note: forms like 182.254.52.218/amobile.music.tc.qq.com sometimes appear and need special handling
+ * @param url the full URL
+ * @return the host used for data mining
+ */
public static String getFirstURLPath(String url) {
String[] yspHostList = url.split("/");
- if (yspHostList.length >= 1)
- return yspHostList[0];
- else return url;
+ if (yspHostList.length >= 2) {
+ if (isCorrectIp(yspHostList[0]) && isDomainCrude(yspHostList[1]))
+ return yspHostList[1];
+ else {
+ return yspHostList[0];
+ }
+ }
+ return url;
+ }
+
+ public static boolean isCorrectIp(String ipString) {
+ String ipRegex = "\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}"; // regex for an IPv4 address
+ // if the string matches the pattern, check that every octet lies in 0-255
+ if (ipString.matches(ipRegex)) {
+ String[] ipArray = ipString.split("\\.");
+ for (int i = 0; i < ipArray.length; i++) {
+ int number = Integer.parseInt(ipArray[i]);
+ // check that each octet is within 0-255
+ if (number < 0 || number > 255) {
+ return false;
+ }
+ }
+ return true;
+ }
+ else {
+ return false; // not a match for the IPv4 pattern
+ }
+ }
+
+ private static boolean isDomainCrude(String isString) {
+ String[] strSpl = isString.split("\\.");
+ if (strSpl.length == 1)
+ return false;
+ char[] chars = strSpl[strSpl.length - 1].toCharArray();
+ boolean isPhontic;
+ for (char aChar : chars) {
+ isPhontic = (aChar >= 'a' && aChar <= 'z') || (aChar >= 'A' && aChar <= 'Z');
+ if (!isPhontic) {
+ return false;
+ }
+ }
+ return true;
}
public static String handleSNI(String thisHost) {
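
A quick sanity check of the behavior described in the Note above: a hypothetical main() exercising getFirstURLPath on both URL shapes. The expected outputs follow from the isCorrectIp/isDomainCrude branches in this diff.

import cn.ac.iie.Utils.URLHandleUtils;

public class URLHandleDemo {
    public static void main(String[] args) {
        // Segment 0 is a well-formed IPv4 address and segment 1 ends in a
        // purely alphabetic TLD, so the domain is preferred over the IP.
        System.out.println(URLHandleUtils.getFirstURLPath(
                "182.254.52.218/amobile.music.tc.qq.com")); // amobile.music.tc.qq.com

        // Segment 0 is not an IP, so it is returned unchanged.
        System.out.println(URLHandleUtils.getFirstURLPath(
                "amobile.music.tc.qq.com/weixin/video.mp4")); // amobile.music.tc.qq.com

        // No "/" at all: the whole string falls through.
        System.out.println(URLHandleUtils.getFirstURLPath("v.qq.com")); // v.qq.com
    }
}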
diff --git a/src/main/java/cn/ac/iie/config/Configurations.java b/src/main/java/cn/ac/iie/config/Configurations.java
index 4294233..4c03c01 100644
--- a/src/main/java/cn/ac/iie/config/Configurations.java
+++ b/src/main/java/cn/ac/iie/config/Configurations.java
@@ -1,17 +1,20 @@
package cn.ac.iie.config;
-import java.io.FileNotFoundException;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.Properties;
public class Configurations {
private static Properties propCommon = new Properties();
+ private static Logger logger = LoggerFactory.getLogger(Configurations.class);
static {
try {
propCommon.load(Configurations.class.getClassLoader().getResourceAsStream("video_data.properties"));
- System.out.println("配置文件video_data.properties加载成功");
+ logger.warn("配置文件video_data.properties加载成功");
} catch (Exception e) {
propCommon = null;
System.err.println("Failed to load configuration");
diff --git a/src/main/java/cn/ac/iie/intervalStatics/FeatureGenerator.java b/src/main/java/cn/ac/iie/intervalStatics/FeatureGenerator.java
index 5eeabb1..e263e97 100644
--- a/src/main/java/cn/ac/iie/intervalStatics/FeatureGenerator.java
+++ b/src/main/java/cn/ac/iie/intervalStatics/FeatureGenerator.java
@@ -23,6 +23,7 @@ import scala.collection.mutable.WrappedArray;
import java.io.*;
import java.nio.charset.StandardCharsets;
import java.util.*;
+import static org.apache.spark.sql.functions.*;
public class FeatureGenerator {
static {
@@ -39,12 +40,12 @@ public class FeatureGenerator {
* Evaluation: Index avg and Prep pos=1
* Output: the vectors for each ysp host are separated from one another; each line under a host has the format [host, true/false, feature vector]
*/
- public void featureExtracFromFile() {
- try (InputStream is = this.getClass().getClassLoader().getResourceAsStream("testSet.txt");
- Writer writer = new OutputStreamWriter(new FileOutputStream("feature.txt"), "UTF-8");
+ public void featureExtracFromFile(String fileName) {
+ try (InputStream is = this.getClass().getClassLoader().getResourceAsStream(fileName);
+ Writer writer = new OutputStreamWriter(new FileOutputStream("feature.txt"), StandardCharsets.UTF_8);
SqlSession sqlSession = SqlSessionFactoryUtils.getSqlSessionFactory().openSession()) {
if (is == null) {
- System.out.println("file does not exist");
+ logger.error(String.format("%s does not exist", fileName));
System.exit(0);
}
AVDataListMapper avListMapper = sqlSession.getMapper(AVDataListMapper.class);
@@ -60,7 +61,7 @@ public class FeatureGenerator {
// compute each candidate set first, then determine where the current host sits
// use the feature result as the redis key, appending a FeatExtrac suffix
- if (jedis != null && jedis.exists(yspHost) && !yspHost.startsWith("200.200.200.200")) {
+ if (jedis != null && !yspHost.startsWith("200.200.200.200")) {
// everything previously written to redis already met the criteria, so the cache can be used to decide here
// when run standalone, this step actually needs to run once
writer.write(yspHost + " " + refererHost + "\r\n");
@@ -70,17 +71,17 @@ public class FeatureGenerator {
recallAndResort(yspHost, refererHost, writer, jedis, avListMapper, 1);
}
else {
- logger.warn(String.format("%s在Cache中存在", yspHost));
+ logger.warn(String.format("%s 在 Cache 中存在", yspHost));
String[] candSetArray = candSet.substring(1, candSet.length() - 1).split(", ");
for (String eachHost: candSetArray) {
String hostKey = eachHost + "_FeatExtrac";
String cacheLine = jedis.hget(yspHost, hostKey);
- if (cacheLine == null) {
- logger.error("candSet cache存在而某条feature cache不存在!");
+ if (cacheLine == null && eachHost.length() > 0) {
+ logger.error(String.format("candSet cache 存在而 %s 不存在!", hostKey));
continue;
}
// note the format of cacheLine
- writer.write(eachHost + "," + eachHost.equals(refererHost) + "," + cacheLine);
+ writer.write(eachHost + " " + eachHost.equals(refererHost) + " " + cacheLine);
}
}
}
@@ -99,16 +100,16 @@ public class FeatureGenerator {
* Writes each line into the training data. To keep the host count from exploding, a line is written only if it appears in both candidate sets.
* Returns 0 if the referer does not appear in both candidate sets, otherwise 1.
* @param writer
+ * @param allHisMap map of full history counts; values are occurrence counts. Note that allHisMap can grow very large
* @param rule1Map map for the 1-item sets; values are counts
* @param rule2Map map for rule 2; values are how many times the candidate appears later (i.e. not at the head)
- * @param allHisMap map of full history counts; values are occurrence counts. Note that allHisMap can grow very large
* @param candSet the candidate set, of course~
*/
- private int featureWrite(String yspHost, String referHost, Writer writer, Jedis jedis,
- Map<String, Integer> allHisMap, Map<String, Long> prefixCntMap,
- Map<String, Integer[]> rule1Map, Map<String, Integer[]> rule2Map,
- Map<String, Integer> occHash, Map<String, Integer> disToYspMap,
- Set<String> candSet, float candHostSum, float hisSum, float freqPatterns) throws IOException {
+ private void featureWrite(String yspHost, String referHost, Writer writer, Jedis jedis,
+ Map<String, Integer> allHisMap, Map<String, Long> prefixCntMap,
+ Map<String, Integer[]> rule1Map, Map<String, Integer[]> rule2Map,
+ Map<String, Integer> occHash, Map<String, Integer> disToYspMap,
+ Set<String> candSet, float candHostSum, float hisSum, float freqPatterns) throws IOException {
jedis.hset(yspHost, "candSet", candSet.toString());
for (String thisHost : candSet) {
if ((rule1Map.containsKey(thisHost) || allHisMap.containsKey(thisHost)) && thisHost.length() > 0) {
@@ -138,9 +139,6 @@ public class FeatureGenerator {
}
}
- if (rule1Map.containsKey(referHost) || allHisMap.containsKey(referHost))
- return 1;
- else return 0;
}
/**
@@ -158,7 +156,7 @@ public class FeatureGenerator {
(int)(allHisMap.getOrDefault(o1, 0) + prefixCntMap.getOrDefault(o1, 0L) +
rule1Map.getOrDefault(o1, new Integer[]{0, 0})[0] -
rule2Map.getOrDefault(o1, new Integer[]{0, 0})[0])));
- jedis.hset(yspHost, "candSortedSet", candList.toString());
+ jedis.hset(yspHost, "candiSortedSet", candList.toString());
for (String thisHost : candList) {
writer.write(thisHost + "\r\n");
}
@@ -211,7 +209,11 @@ public class FeatureGenerator {
String likeUrl = '%' + yspHost + '%';
List<AVlog> aVlogs = avListMapper.getHistoryHost(likeUrl, Configurations.getIntProperty(0, "rawLimit"));
float candHostSum = aVlogs.size();
- logger.warn(String.format("%s过去曾出现过%f次", yspHost, candHostSum));
+ if (candHostSum < Configurations.getIntProperty(0, "leastHistoryNum")) {
+ logger.warn(String.format("%s 过去曾出现过 %f 次,不符合 Mining 条件", yspHost, candHostSum));
+ return;
+ }
+ else logger.warn(String.format("%s 过去曾出现过 %f 次", yspHost, candHostSum));
// 下面的几个Hash表其实key是一样的,是否应该合并一下呢
// 统计历史总量的Hash表,用于召回
@@ -280,7 +282,7 @@ public class FeatureGenerator {
int minMiNum = Configurations.getIntProperty(0, "leastHistoryNum");
if (historyCnt - overflowCols < minMiNum || data.size() < minMiNum) {
- logger.warn("不满足mining条件");
+ logger.warn("不满足 Mining 条件");
return;
}
@@ -296,12 +298,13 @@ public class FeatureGenerator {
false, Metadata.empty())
});
Dataset<Row> sequenceDF = SparkFactoryUtils.getSparkSessionFactory().createDataFrame(data, schema);
- sequenceDF.show(20, true);
- //sequenceDF.show(50, false);
- PrefixSpan prefixSpan = new PrefixSpan().setMinSupport(Configurations.getDoubleProperty(0, "prefixConfidence")).setMaxPatternLength(5);
+ // sequenceDF.show(20, true);
+ PrefixSpan prefixSpan = new PrefixSpan()
+ .setMinSupport(Configurations.getDoubleProperty(0, "prefixConfidence"))
+ .setMaxPatternLength(Configurations.getIntProperty(0, "prefixMaxPatternLength"));
// Finding frequent sequential patterns
Dataset<Row> sequence = prefixSpan.findFrequentSequentialPatterns(sequenceDF);
- sequence.show(10, false);
+ // sequence.show(10, false);
// build candidate is rewritten here
HashMap<String, Integer[]> restFirstMap = new HashMap<>();
@@ -310,11 +313,29 @@ public class FeatureGenerator {
String candidateItem;
HashMap<String, Long> prefixCntMap = new HashMap<>();
float freqPatterns = 0;
- for(Iterator<Row> iter = sequence.toLocalIterator(); iter.hasNext();) {
+ // Iterator<Row> iter = sequence.toLocalIterator();
+ Dataset<Row> sequenceSorted = sequence.sort(col("freq").desc());
+
+ // TODO: switch to take()-ing the first n rows to avoid running out of memory
+ // List<Row> out = sequenceSorted.takeAsList(100);
+ // for (Row row: out) {
+ // System.out.println(row.get(0));
+ // System.out.println(row.getStruct(0));
+ // System.out.println(row.apply(0));
+ // System.out.println(row.apply(1)); // freq
+ // System.out.println(row.get(1));
+ // System.out.println(row.toString());
+ // }
+ // System.exit(1);
+
+ Iterator<Row> iter = sequenceSorted.toLocalIterator();
+ logger.warn("序列挖掘成功!");
+ while(iter.hasNext()) {
Row each = iter.next();
if (each.size() == 2) {
freqPatterns += 1;
try {
+ // the warning below is harmless; fill in the generics if it bothers you
WrappedArray rawSequence = (WrappedArray)each.get(0);
WrappedArray firstItem = (WrappedArray)rawSequence.apply(0);
if (rawSequence.size() == 1) {
@@ -392,13 +413,16 @@ public class FeatureGenerator {
return candSet;
}
- public void refreshCache() {
+ /**
+ * @param fields the hash fields to delete, e.g. candSet and candiSortedSet
+ */
+ public void refreshCache(String... fields) {
logger.warn("清除Cache...");
Jedis jedis = JedisPoolUtils.getJedis();
assert jedis != null;
Set<String> cacheKeys = jedis.keys("*");
for (String key: cacheKeys) {
- jedis.hdel(key, "candSet", "candSortedSet");
+ jedis.hdel(key, fields);
}
JedisPoolUtils.returnResource(jedis);
}
@@ -416,15 +440,16 @@ public class FeatureGenerator {
List<AVlog> yspSet = avListMapper.getUniqAVListForAll();
// take the processed urls and run sequence mining on them
+ int cnt = 1;
for(AVlog yspLog: yspSet) {
String yspHost = yspLog.getUrl();
if (yspHost.length() == 0)
continue;
writer.write("-----------------------------------------------------------------------------------\r\n");
writer.write("*" + " " + yspHost + "\r\n");
- System.out.println(yspHost);
+ logger.warn(String.format("当前处理 %s,已处理 %d 条", yspHost, cnt));
assert jedis != null;
- String candSet = jedis.hget(yspHost, "candSortedSet");
+ String candSet = jedis.hget(yspHost, "candiSortedSet");
if (candSet == null) {
recallAndResort(yspHost, "", writer, jedis, avListMapper, 2);
}
@@ -435,6 +460,7 @@ public class FeatureGenerator {
}
}
writer.write("-----------------------------------------------------------------------------------\r\n");
+ cnt += 1;
}
} catch (IOException e) {
e.printStackTrace();
@@ -445,9 +471,9 @@ public class FeatureGenerator {
public static void main(String[] args) {
FeatureGenerator featureGenerator = new FeatureGenerator();
logger.warn("特征生成任务开始...");
- featureGenerator.refreshCache();
- featureGenerator.GetYspCandNoRef();
-// featureGenerator.featureExtracFromFile();
+ // featureGenerator.refreshCache("candSet");
+ // featureGenerator.GetYspCandNoRef();
+ featureGenerator.featureExtracFromFile("testSetALL.txt");
}
}
\ No newline at end of file
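
For context, the commit's headline change — reading the maximum pattern length from configuration — plugs into Spark ML's PrefixSpan as in the self-contained sketch below. The host sequences are made-up sample data, the 0.3 and 2 literals stand in for prefixConfidence and prefixMaxPatternLength, and the takeAsList() at the end follows the TODO in the diff about bounding how many rows are pulled onto the driver.

import org.apache.spark.ml.fpm.PrefixSpan;
import org.apache.spark.sql.*;
import org.apache.spark.sql.types.*;

import java.util.Arrays;
import java.util.List;

import static org.apache.spark.sql.functions.col;

public class PrefixSpanSketch {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder()
                .master("local[*]").appName("PrefixSpanSketch").getOrCreate();

        // Each row is one browsing session: a sequence of single-host itemsets.
        List<Row> data = Arrays.asList(
                RowFactory.create(Arrays.asList(
                        Arrays.asList("v.qq.com"), Arrays.asList("amobile.music.tc.qq.com"))),
                RowFactory.create(Arrays.asList(
                        Arrays.asList("v.qq.com"), Arrays.asList("cdn.qq.com"))));
        StructType schema = new StructType(new StructField[]{
                new StructField("sequence",
                        new ArrayType(new ArrayType(DataTypes.StringType, true), true),
                        false, Metadata.empty())});
        Dataset<Row> sequenceDF = spark.createDataFrame(data, schema);

        PrefixSpan prefixSpan = new PrefixSpan()
                .setMinSupport(0.3)       // stand-in for prefixConfidence
                .setMaxPatternLength(2);  // stand-in for prefixMaxPatternLength

        Dataset<Row> patterns = prefixSpan.findFrequentSequentialPatterns(sequenceDF);

        // Pull only the N most frequent patterns onto the driver instead of
        // iterating the whole result, as the TODO suggests.
        for (Row row : patterns.sort(col("freq").desc()).takeAsList(100)) {
            System.out.println(row);
        }
        spark.stop();
    }
}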
diff --git a/src/main/java/cn/ac/iie/mining/MiningService.java b/src/main/java/cn/ac/iie/mining/MiningService.java
index 496ea65..65ab6c6 100644
--- a/src/main/java/cn/ac/iie/mining/MiningService.java
+++ b/src/main/java/cn/ac/iie/mining/MiningService.java
@@ -288,7 +288,9 @@ public class MiningService {
Dataset<Row> sequenceDF = SparkFactoryUtils.getSparkSessionFactory().createDataFrame(data, schema);
sequenceDF.show(20, false);
//sequenceDF.show(50, false);
- PrefixSpan prefixSpan = new PrefixSpan().setMinSupport(Configurations.getDoubleProperty(0, "prefixConfidence")).setMaxPatternLength(5);
+ PrefixSpan prefixSpan = new PrefixSpan()
+ .setMinSupport(Configurations.getDoubleProperty(0, "prefixConfidence"))
+ .setMaxPatternLength(Configurations.getIntProperty(0, "prefixMaxPatternLength"));
// Finding frequent sequential patterns
Dataset<Row> sequence = prefixSpan.findFrequentSequentialPatterns(sequenceDF);
@@ -363,7 +365,6 @@ public class MiningService {
* Using 1-item sets directly gives results no different from the plain counts; the entry point actually tends to appear at the head of multi-item sets rather than as a 1-item set
* Rule 1: the entry point tends to appear at the head of multi-item sets -> count how often a candidate appears at the head of multi-item sets
* Rule 2: the entry point tends not to appear in unordered sets -> count the proportion of a candidate's non-head appearances (this is not necessarily good for now, since many big sites may well show up later on~)
- * TODO: the sorting implementation needs improvement; ranking by sort order alone ignores the numeric relationships and loses part of the features. A learning-to-rank approach could be considered later
*
* @param candidateList the 1-item sets
* @param restMap the multi-item sets
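
A minimal, self-contained sketch of the ranking these two rules imply, assuming the score is the same sum FeatureGenerator uses elsewhere in this diff (history count + rule-1 head count - rule-2 non-head count); the hosts and counts here are made up:

import java.util.*;

public class CandidateRankSketch {
    public static void main(String[] args) {
        // Made-up counts for two candidate hosts.
        Map<String, Integer> history = Map.of("v.qq.com", 40, "cdn.qq.com", 55);
        Map<String, Integer> rule1Head = Map.of("v.qq.com", 12);      // appearances at the head of multi-item sets
        Map<String, Integer> rule2NonHead = Map.of("cdn.qq.com", 30); // appearances NOT at the head

        List<String> candidates = new ArrayList<>(List.of("cdn.qq.com", "v.qq.com"));
        candidates.sort(Comparator.comparingInt(h ->
                -(history.getOrDefault(h, 0)
                        + rule1Head.getOrDefault(h, 0)
                        - rule2NonHead.getOrDefault(h, 0))));

        // Portal-like hosts (often at pattern heads, rarely mid-sequence)
        // rank first: prints [v.qq.com, cdn.qq.com]
        System.out.println(candidates);
    }
}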
diff --git a/src/main/java/cn/ac/iie/mining/prefixSpark.java b/src/main/java/cn/ac/iie/mining/prefixSpark.java
index 330b6fd..0019559 100644
--- a/src/main/java/cn/ac/iie/mining/prefixSpark.java
+++ b/src/main/java/cn/ac/iie/mining/prefixSpark.java
@@ -53,7 +53,7 @@ public class prefixSpark {
// Finding frequent sequential patterns
Dataset<Row> sequence = prefixSpan.findFrequentSequentialPatterns(sequenceDF);
- sequence.show();
+ sequence.show(100, false);
// $example off$
//List<String> listOne = sequence.as(Encoders.STRING()).collectAsList();
//System.out.println(listOne);
diff --git a/src/main/resources/video_data.properties b/src/main/resources/video_data.properties
index 93182af..66aabd9 100644
--- a/src/main/resources/video_data.properties
+++ b/src/main/resources/video_data.properties
@@ -12,6 +12,8 @@ intervalRight = 15
Spark.master=spark://192.168.10.9:7077
# confidence threshold for the prefix algorithm
prefixConfidence=0.3
+# maximum pattern length for PrefixSpan
+prefixMaxPatternLength=2
# history window used for mining: current time - max interval = earliest history log used; note the unit here is days
historyTrafficInterval=180
# maximum capacity of the candidate set
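
The new key is read as Configurations.getIntProperty(0, "prefixMaxPatternLength") at both call sites. The body of that helper is not shown in this diff, so the following is only an assumption about how such a default-first accessor typically looks:

// Hypothetical shape of the accessor used at the call sites above; the real
// implementation in Configurations.java is not part of this diff.
public static int getIntProperty(int defaultValue, String key) {
    if (propCommon == null) return defaultValue;  // config failed to load
    String raw = propCommon.getProperty(key);
    if (raw == null) return defaultValue;
    try {
        return Integer.parseInt(raw.trim());
    } catch (NumberFormatException e) {
        return defaultValue;
    }
}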
diff --git a/target/classes/META-INF/VideoPortalDetection.kotlin_module b/target/classes/META-INF/VideoPortalDetection.kotlin_module
deleted file mode 100644
index a49347a..0000000
--- a/target/classes/META-INF/VideoPortalDetection.kotlin_module
+++ /dev/null
Binary files differ
diff --git a/target/classes/video_data.properties b/target/classes/video_data.properties
index 93182af..66aabd9 100644
--- a/target/classes/video_data.properties
+++ b/target/classes/video_data.properties
@@ -12,6 +12,8 @@ intervalRight = 15
Spark.master=spark://192.168.10.9:7077
# confidence threshold for the prefix algorithm
prefixConfidence=0.3
+# maximum pattern length for PrefixSpan
+prefixMaxPatternLength=2
# history window used for mining: current time - max interval = earliest history log used; note the unit here is days
historyTrafficInterval=180
# maximum capacity of the candidate set