5人参与 • 2025-07-28 • Python
基于文本内容比较的相似度检测工具
以下是一个完整的文件相似度检测函数实现,支持多种相似度算法和阈值判断:
import difflib from pathlib import path import re from collections import counter import math import string def are_files_similar( file1: str | path, file2: str | path, similarity_threshold: float = 0.8, method: str = "cosine" ) -> bool: """ 比较两个文件的相似度是否超过指定阈值 参数: file1: 第一个文件路径 file2: 第二个文件路径 similarity_threshold: 相似度阈值 (0-1) method: 相似度计算方法 'cosine' - 余弦相似度 (默认) 'jaccard' - jaccard相似度 'levenshtein' - 编辑距离相似度 'sequence' - 序列匹配相似度 返回: bool: 相似度是否超过阈值 """ # 读取文件内容 content1 = _read_file(file1) content2 = _read_file(file2) # 空文件处理 if not content1 and not content2: return true # 两个空文件视为相同 # 选择计算方法 if method == "cosine": similarity = _cosine_similarity(content1, content2) elif method == "jaccard": similarity = _jaccard_similarity(content1, content2) elif method == "levenshtein": similarity = _levenshtein_similarity(content1, content2) elif method == "sequence": similarity = _sequence_similarity(content1, content2) else: raise valueerror(f"未知的相似度计算方法: {method}") return similarity >= similarity_threshold def _read_file(file_path: str | path) -> str: """读取文件内容并进行预处理""" path = path(file_path) if not path.exists(): raise filenotfounderror(f"文件不存在: {path}") # 读取文件内容 try: with open(path, 'r', encoding='utf-8') as f: content = f.read() except unicodedecodeerror: # 尝试其他编码 with open(path, 'r', encoding='latin-1') as f: content = f.read() # 基础预处理 content = content.lower() content = re.sub(r'\s+', ' ', content) # 合并连续空白 return content.strip() def _cosine_similarity(text1: str, text2: str) -> float: """计算余弦相似度""" # 创建词频向量 vec1 = counter(_tokenize(text1)) vec2 = counter(_tokenize(text2)) # 获取所有唯一词 words = set(vec1.keys()) | set(vec2.keys()) # 创建向量 vector1 = [vec1.get(word, 0) for word in words] vector2 = [vec2.get(word, 0) for word in words] # 计算点积 dot_product = sum(v1 * v2 for v1, v2 in zip(vector1, vector2)) # 计算模长 magnitude1 = math.sqrt(sum(v**2 for v in vector1)) magnitude2 = math.sqrt(sum(v**2 for v in vector2)) # 避免除以零 if magnitude1 == 0 or magnitude2 == 0: return 0.0 return 
dot_product / (magnitude1 * magnitude2) def _jaccard_similarity(text1: str, text2: str) -> float: """计算jaccard相似度""" set1 = set(_tokenize(text1)) set2 = set(_tokenize(text2)) intersection = len(set1 & set2) union = len(set1 | set2) if union == 0: return 1.0 # 两个空集 return intersection / union def _levenshtein_similarity(text1: str, text2: str) -> float: """基于编辑距离的相似度""" # 计算编辑距离 n, m = len(text1), len(text2) if n == 0 or m == 0: return 0.0 # 创建距离矩阵 d = [[0] * (m + 1) for _ in range(n + 1)] # 初始化边界 for i in range(n + 1): d[i][0] = i for j in range(m + 1): d[0][j] = j # 计算距离 for i in range(1, n + 1): for j in range(1, m + 1): cost = 0 if text1[i - 1] == text2[j - 1] else 1 d[i][j] = min( d[i - 1][j] + 1, # 删除 d[i][j - 1] + 1, # 插入 d[i - 1][j - 1] + cost # 替换 ) distance = d[n][m] max_len = max(n, m) return 1 - (distance / max_len) def _sequence_similarity(text1: str, text2: str) -> float: """基于序列匹配的相似度""" matcher = difflib.sequencematcher(none, text1, text2) return matcher.ratio() def _tokenize(text: str) -> list[str]: """文本分词处理""" # 移除标点 text = text.translate(str.maketrans('', '', string.punctuation)) # 分词 return text.split()
# 比较两个文件是否相似度超过80% result = are_files_similar("file1.txt", "file2.txt", 0.8) print(f"文件相似: {result}")
# 使用jaccard相似度 result = are_files_similar("doc1.md", "doc2.md", method="jaccard") # 使用编辑距离相似度 result = are_files_similar("code1.py", "code2.py", method="levenshtein")
def find_similar_files(directory, threshold=0.9):
    """Return name pairs of files in *directory* whose similarity meets *threshold*.

    Args:
        directory: Directory to scan (non-recursive, all entries matched by "*").
        threshold: Similarity threshold forwarded to are_files_similar.

    Returns:
        list[tuple[str, str]]: File-name pairs judged similar.
    """
    from itertools import combinations
    from pathlib import Path

    files = list(Path(directory).glob("*"))
    similar_pairs = []
    # O(n^2) pairwise comparison over all files in the directory.
    for file1, file2 in combinations(files, 2):
        if are_files_similar(file1, file2, threshold):
            similar_pairs.append((file1.name, file2.name))
    return similar_pairs
算法 | 适用场景 | 特点 |
---|---|---|
余弦相似度 | 长文档、自然语言 | 考虑词频,忽略词序 |
Jaccard相似度 | 短文本、关键词匹配 | 基于集合运算 |
编辑距离相似度 | 代码、配置文件 | 考虑字符级差异 |
序列匹配相似度 | 通用文本 | Python内置算法 |
def _read_large_file(file_path: path) -> str: """分块读取大文件""" content = [] with open(file_path, 'r', encoding='utf-8', errors='ignore') as f: while true: chunk = f.read(65536) # 64kb块 if not chunk: break content.append(chunk.lower()) return ' '.join(content)
def _jaccard_similarity_large(text1: str, text2: str) -> float:
    """Approximate Jaccard similarity for large texts using MinHash.

    Requires the third-party ``datasketch`` package; imported lazily so the
    rest of the module works without it.

    Args:
        text1: First text.
        text2: Second text.

    Returns:
        Estimated Jaccard similarity in [0, 1].
    """
    from datasketch import MinHash

    # 128 permutations trades accuracy against hashing cost.
    m1 = MinHash(num_perm=128)
    m2 = MinHash(num_perm=128)

    # Hash each distinct token into the signatures.
    for word in set(_tokenize(text1)):
        m1.update(word.encode('utf-8'))
    for word in set(_tokenize(text2)):
        m2.update(word.encode('utf-8'))

    return m1.jaccard(m2)
from concurrent.futures import ThreadPoolExecutor


def batch_compare(file_pairs, threshold=0.8):
    """Compare many file pairs concurrently using a thread pool.

    Threads suit this workload because file reading is I/O-bound.

    Args:
        file_pairs: Iterable of (Path, Path) pairs to compare.
        threshold: Similarity threshold forwarded to are_files_similar.

    Returns:
        dict[tuple[str, str], bool]: (name1, name2) -> whether the pair is similar.
    """
    results = {}
    with ThreadPoolExecutor() as executor:
        futures = {
            (pair[0].name, pair[1].name): executor.submit(
                are_files_similar, pair[0], pair[1], threshold
            )
            for pair in file_pairs
        }
        # future.result() re-raises any exception from the worker thread.
        for names, future in futures.items():
            results[names] = future.result()
    return results
def check_plagiarism(submitted_file, source_files, threshold=0.7):
    """Return True if *submitted_file* resembles any file in *source_files*.

    Stops at (and prints) the first matching source.

    Args:
        submitted_file: Path of the document to check.
        source_files: Iterable of candidate source paths.
        threshold: Similarity threshold forwarded to are_files_similar.

    Returns:
        bool: True on the first match, False if nothing matches.
    """
    for source in source_files:
        if are_files_similar(submitted_file, source, threshold):
            print(f"检测到与 {source} 相似")
            return True
    return False
def detect_code_clones(repo_path):
    """Find pairs of .py files under *repo_path* that look like code clones.

    Uses character-level edit-distance similarity with a fixed 0.85 threshold.

    Args:
        repo_path: Repository root; searched recursively for "*.py".

    Returns:
        list[tuple[Path, Path]]: Pairs of files judged to be clones.
    """
    from itertools import combinations
    from pathlib import Path

    code_files = list(Path(repo_path).rglob("*.py"))
    clones = []
    # Pairwise comparison is O(n^2) in the number of files; Levenshtein on
    # long files is itself O(len1 * len2), so this is expensive on big repos.
    for file1, file2 in combinations(code_files, 2):
        if are_files_similar(file1, file2, 0.85, method="levenshtein"):
            clones.append((file1, file2))
    return clones
def find_most_similar_version(target_file, versions):
    """Return the (version_file, result) pair ranked most similar to *target_file*.

    Args:
        target_file: File to match against.
        versions: Iterable of candidate version files.

    Returns:
        tuple: The top-ranked (version_file, result) pair.

    NOTE(review): are_files_similar returns a bool, not a score, so this sort
    only separates versions above the default 0.8 threshold from those below
    it — an exact "most similar" ranking would need the raw similarity value.
    TODO confirm intent.
    """
    similarities = []
    for version_file in versions:
        sim = are_files_similar(target_file, version_file, method="sequence")
        similarities.append((version_file, sim))

    # Sort descending by the comparison result and take the top entry.
    return sorted(similarities, key=lambda x: x[1], reverse=True)[0]
import unittest
import tempfile


class TestFileSimilarity(unittest.TestCase):
    """Unit tests for are_files_similar and its algorithm variants."""

    def setUp(self):
        # delete=False keeps the files on disk after the handle is reused,
        # so the similarity functions can reopen them by name.
        self.file1 = tempfile.NamedTemporaryFile(delete=False, mode='w+')
        self.file2 = tempfile.NamedTemporaryFile(delete=False, mode='w+')
        self.file3 = tempfile.NamedTemporaryFile(delete=False, mode='w+')

        self.file1.write("this is a test file for similarity comparison.")
        self.file2.write("this is a test file for similarity comparison.")
        self.file3.write("this is a completely different file content.")

        # Flush so the content is on disk before any test reads it.
        self.file1.flush()
        self.file2.flush()
        self.file3.flush()

    def test_identical_files(self):
        self.assertTrue(are_files_similar(self.file1.name, self.file2.name))

    def test_different_files(self):
        self.assertFalse(are_files_similar(self.file1.name, self.file3.name, 0.8))

    def test_empty_files(self):
        # Two empty files are defined as identical by are_files_similar.
        with tempfile.NamedTemporaryFile(mode='w+') as empty1, \
                tempfile.NamedTemporaryFile(mode='w+') as empty2:
            self.assertTrue(are_files_similar(empty1.name, empty2.name))

    def test_various_methods(self):
        # Identical files should score (near-)1.0 under every algorithm.
        # are_files_similar returns a bool; with threshold 0.0 it is True,
        # and True == 1.0 numerically, so assertAlmostEqual passes.
        for method in ("cosine", "jaccard", "levenshtein", "sequence"):
            self.assertAlmostEqual(
                are_files_similar(self.file1.name, self.file2.name, 0.0, method),
                1.0,
                delta=0.01,
            )

    def tearDown(self):
        # Remove the delete=False temp files created in setUp.
        from pathlib import Path
        Path(self.file1.name).unlink()
        Path(self.file2.name).unlink()
        Path(self.file3.name).unlink()


if __name__ == "__main__":
    unittest.main()
这个文件相似度检测函数提供了:
使用示例:
# 基本使用 result = are_files_similar("file1.txt", "file2.txt", 0.75) # 指定算法 result = are_files_similar("doc1.md", "doc2.md", method="jaccard")
通过这个函数,您可以轻松实现:
以上就是基于Python实现一个文件相似度检测工具的详细内容,更多关于Python文件相似度检测的资料请关注代码网其它相关文章!
您想发表意见!!点此发布评论
版权声明:本文内容由互联网用户贡献,该文观点仅代表作者本人。本站仅提供信息存储服务,不拥有所有权,不承担相关法律责任。 如发现本站有涉嫌抄袭侵权/违法违规的内容, 请发送邮件至 2386932994@qq.com 举报,一经查实将立刻删除。
发表评论