利用文本挖掘技术来找出网络中的“小鲜词”
发布时间:2021-01-18 13:09:28 所属栏目:大数据 来源:网络整理
导读:开始之前,先看一下从人人网中发现的90后用户爱用的词。是不是很好玩,哈哈。写这篇文章就是让你简单、自动地从文本中找出新的词,这样就知道现在的年轻人喜欢什么了(对于博主这种上了年纪的人来说,真的是很有用,呜呜)。项目结构 当然,text.da
|
EntropyJudger.java计算熵值 package grid.text.evolution;
import grid.common.CountMap;
import grid.common.TextUtils;
import grid.text.index.Pos;
import grid.text.index.TextIndexer;
/**
 * Decides whether a candidate string should be accepted as a word, using two
 * statistics computed from a {@link TextIndexer}: a "solid rate" (how often the
 * candidate occurs relative to its individual characters) and the entropy of
 * the characters found directly before and after each occurrence.
 */
public class EntropyJudger {

    /** Candidates that occur fewer times than this get a solid rate of 0. */
    private static final int LEAST_COUNT_THRESHOLD = 5;

    /**
     * Minimum solid rate a candidate must reach in {@link #judge(String)}.
     *
     * The smaller this value is, the more new words you will get, but with
     * less accuracy. The greater this value is, the fewer new words you will
     * get, but with higher accuracy.
     */
    private static final double SOLID_RATE_THRESHOLD = 0.018;

    /**
     * Minimum neighbour entropy a candidate must reach in
     * {@link #judge(String)}. The entropy is computed from the counts of the
     * characters preceding and following the candidate.
     *
     * Lowering this value lets more candidates through; raising it rejects
     * more of them.
     */
    private static final double ENTROPY_THRESHOLD = 1.92;

    /** Source of occurrence positions, counts and characters; replaceable via the setter. */
    private TextIndexer indexer;

    /**
     * @param indexer the indexer queried for occurrence counts and positions
     */
    public EntropyJudger(TextIndexer indexer) {
        this.indexer = indexer;
    }

    /**
     * Tests a candidate against both thresholds. The (cheaper) solid-rate check
     * runs first so the entropy scan is skipped for candidates that already
     * fail.
     *
     * @param candidate the string to evaluate
     * @return {@code true} if neither the solid rate nor the entropy is below
     *         its threshold
     */
    public boolean judge(String candidate) {
        if (getSolidRate(candidate) < SOLID_RATE_THRESHOLD) {
            return false;
        }
        if (getEntropy(candidate) < ENTROPY_THRESHOLD) {
            return false;
        }
        return true;
    }

    /**
     * Walks every occurrence of the candidate reported by the indexer, tallies
     * the character immediately before and immediately after it (only those
     * accepted by {@code TextUtils.isCnLetter}), and returns the smaller of the
     * two resulting entropies.
     *
     * NOTE(review): the indexer is asked for {@code off - 1} and
     * {@code off + length} without a range check; this presumably relies on
     * {@code TextIndexer.charAt} tolerating positions outside the text —
     * confirm against the indexer implementation.
     */
    private double getEntropy(String candidate) {
        final int candidateLen = candidate.length();
        final Pos pos = new Pos(candidate);
        final CountMap<Character> frontCountMap = new CountMap<Character>();
        final CountMap<Character> backCountMap = new CountMap<Character>();

        while (indexer.find(pos).isFound()) {
            final int off = pos.getPos();

            final char before = indexer.charAt(off - 1);
            if (TextUtils.isCnLetter(before)) {
                frontCountMap.increase(before);
            }

            final char after = indexer.charAt(off + candidateLen);
            if (TextUtils.isCnLetter(after)) {
                backCountMap.increase(after);
            }
        }

        return Math.min(entropyOf(frontCountMap), entropyOf(backCountMap));
    }

    /**
     * Sums {@code -rate * ln(rate)} over every key of the map, where rate is
     * the key's count divided by {@code counts.count()} (presumably the total
     * of all counts — verify against CountMap). An empty map yields 0.
     */
    private static double entropyOf(CountMap<Character> counts) {
        double entropy = 0;
        for (char key : counts.keySet()) {
            final double rate = (double) counts.get(key) / counts.count();
            entropy -= rate * Math.log(rate);
        }
        return entropy;
    }

    /**
     * Computes the solid rate of a candidate: the geometric mean, over its
     * characters, of (candidate count / single-character count), multiplied by
     * the square root of the candidate length.
     *
     * @param candidate the string to evaluate
     * @return 1 for candidates shorter than two characters, 0 for candidates
     *         occurring fewer than {@code LEAST_COUNT_THRESHOLD} times,
     *         otherwise the computed rate
     */
    public double getSolidRate(String candidate) {
        final int candidateLen = candidate.length();
        if (candidateLen < 2) {
            return 1;
        }

        final int count = indexer.count(candidate);
        if (count < LEAST_COUNT_THRESHOLD) {
            return 0;
        }

        double rate = 1;
        for (int i = 0; i < candidateLen; i++) {
            rate *= (double) count / indexer.count(String.valueOf(candidate.charAt(i)));
        }
        return Math.pow(rate, 1D / candidateLen) * Math.sqrt(candidateLen);
    }

    /**
     * Replaces the indexer used for all subsequent calculations.
     *
     * @param indexer the new indexer
     */
    public void setIndexer(TextIndexer indexer) {
        this.indexer = indexer;
    }
}
NewWordDiscover.java(编辑:PHP编程网 - 湛江站长网) 【声明】本站内容均来自网络,其相关言论仅代表作者个人观点,不代表本站立场。若无意侵犯到您的权利,请及时联系站长删除相关内容! |
站长推荐


