Python 中文相似度:文本相似性算法的 Python 实现对比
# -*- coding: utf-8 -*-
"""
以下是工具api的一些使用样例
"""
from text_analysis_tools.api.text_cluster.Kmeans import KmeansClustering
from text_analysis_tools.api.text_cluster.dbscan import DbscanClustering
from text_analysis_tools.api.text_similarity.cosion import CosionSimilarity
from text_analysis_tools.api.text_similarity.edit import EditSimilarity
from text_analysis_tools.api.text_similarity.simhash import SimHashSimilarity
from text_analysis_tools.api.keywords.tfidf import TfidfKeywords
from text_analysis_tools.api.keywords.textrank import TextRankKeywords
from text_analysis_tools.api.keyphrase.keyphrase import KeyPhraseExtraction
from text_analysis_tools.api.sentiment.sentiment import SentimentAnalysis
from text_analysis_tools.api.spell_correct.spell_correct import SpellCorrect
from text_analysis_tools.api.summarization.tfidf_summarization import TfidfSummarization
from text_analysis_tools.api.summarization.textrank_summarization import TextRankSummarization
from text_analysis_tools.api.topic_keywords.topic_kwywords import TopicKeywords
from text_analysis_tools.api.text_classification.fasttext import FastText
from text_analysis_tools.api.synonym.word2vec import Word2VecSynonym
from text_analysis_tools.api.synonym.synonym_dict import SynonymDict
"""
文本相似性
cosion_sismilarity
edit_similarity
simhash_similarity
"""
def cosion_sismilarity():
    """Print the cosine-based similarity of two sample sentences.

    Builds a ``CosionSimilarity`` instance, compares two hard-coded Chinese
    sentences with its ``similarity`` method and prints the result.

    Returns:
        None. The similarity value is printed, not returned.
    """
    text1 = "小明,你妈妈喊你回家吃饭啦"
    text2 = "回家吃饭啦,小明"
    cosion = CosionSimilarity()
    # The two texts are separate positional arguments; the comma between
    # them is required (without it the call is a syntax error).
    similarity = cosion.similarity(text1, text2)
    print("cosion similarity result: {}\n".format(similarity))
def edit_similarity():
    """Print the edit distance between two sample strings.

    Builds an ``EditSimilarity`` instance, calls its ``edit_dist`` method on
    the strings ``"abc"`` and ``"ab"`` and prints the result.

    Returns:
        None. The edit distance is printed, not returned.
    """
    edit = EditSimilarity()
    # Pass the two strings as separate arguments. Adjacent string literals
    # without a comma are concatenated by Python into the single value
    # "abcab", which is not a comparison of two texts.
    edit_dis = edit.edit_dist("abc", "ab")
    print("edit distance: {}\n".format(edit_dis))
def simhash_similarity():
    """Print the simhash-based similarity result for two sample sentences.

    Builds a ``SimHashSimilarity`` instance, calls its ``run_simhash`` method
    on two hard-coded Chinese sentences and prints whatever it returns.

    Returns:
        None. The simhash result is printed, not returned.
    """
    simhash = SimHashSimilarity()
    # The two sentences must be separate arguments; without the comma the
    # adjacent literals are merged into one string before the call.
    sim = simhash.run_simhash(
        "你妈妈叫你回家吃饭了,小明",
        "小明 妈妈让你回家吃饭了",
    )
    print("simhash result: {}\n".format(sim))
实验结果如下:
可见,基于余弦的文本相似性计算更为可靠,建议大家选用。