1人参与 • 2025-03-10 • Python
本文介绍如何自动化处理 PDF 文档:先提取并清理文本数据,再调用本地大语言模型(通过 Ollama)生成摘要和关键词。最后,处理结果会被整理并输出到 Excel 文件中,便于后续分析和查看。
pip install PyPDF2
pip install ollama
pip install pandas openpyxl
def clean_text(text):
    """Reduce extracted PDF text to single-spaced printable ASCII.

    Every run of whitespace becomes one space, every character outside the
    printable ASCII range (0x20-0x7E) is removed, and the result is stripped.
    Note that this drops all non-ASCII text, including CJK characters.

    Args:
        text: Raw text as returned by the PDF extractor.

    Returns:
        The cleaned, single-line string (possibly empty).
    """
    # Normalise whitespace *before* dropping non-ASCII characters: newlines and
    # tabs are outside 0x20-0x7E, so deleting them first would glue the last
    # word of one line to the first word of the next.
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^\x20-\x7e]+', '', text)  # keep printable ASCII only
    # Removing characters can leave adjacent spaces behind; collapse them.
    return re.sub(r' {2,}', ' ', text).strip()


def process_pdf(pdf_path, output_path):
    """Extract the text of one PDF and write it, one line per page, to a file.

    Pages with no extractable text are recorded with a placeholder line.
    Errors are reported on stdout instead of being raised.

    Args:
        pdf_path: Path of the PDF file to read.
        output_path: Path of the UTF-8 text file to create.

    Returns:
        True if every page was processed, False if any error occurred.
    """
    try:
        with open(pdf_path, "rb") as file:
            reader = PyPDF2.PdfReader(file)
            with open(output_path, "w", encoding='utf-8') as output_file:
                for page in reader.pages:
                    text = page.extract_text()
                    if text:  # extraction produced something
                        output_file.write(clean_text(text) + "\n")
                    else:
                        output_file.write("未提取到有效文本\n")
    except FileNotFoundError:
        print(f"文件未找到: {pdf_path}")
        return False
    except PyPDF2.errors.PdfReadError:
        print(f"无法读取pdf文件: {pdf_path}")
        return False
    except Exception as e:
        # Boundary handler: one bad PDF must not abort a whole batch.
        print(f"处理pdf文件时发生错误: {pdf_path}, 错误信息: {e}")
        return False
    return True
# Exception raised when a TimeoutThread does not finish before its deadline.
class TimeoutException(Exception):
    """Signal that the target was still running when ``join`` timed out."""


# Thread subclass that reports its target's result, exception, or timeout.
class TimeoutThread(threading.Thread):
    """Run a callable in a thread and surface its outcome through ``join``.

    ``join`` returns the target's return value, re-raises the exception the
    target raised, or raises ``TimeoutException`` if the target is still
    running once the timeout has elapsed.
    """

    def __init__(self, target, args=(), kwargs=None):
        """Store the callable and its arguments.

        Args:
            target: Callable to execute in the thread.
            args: Positional arguments for ``target``.
            kwargs: Keyword arguments for ``target``; ``None`` means none.
        """
        # Daemon thread: a target that is abandoned after a timeout must not
        # keep the interpreter alive at exit.
        super().__init__(daemon=True)
        self.target = target
        self.args = args
        # A fresh dict per instance; a `{}` default would be shared by all.
        self.kwargs = {} if kwargs is None else kwargs
        self.result = None
        self.exception = None

    def run(self):
        """Call the target, recording its return value or its exception."""
        try:
            self.result = self.target(*self.args, **self.kwargs)
        except Exception as e:
            # Stored so that join() can re-raise it in the waiting thread.
            self.exception = e

    def join(self, timeout=None):
        """Wait for the thread and report how the target ended.

        Args:
            timeout: Seconds to wait; ``None`` waits indefinitely.

        Returns:
            The value returned by the target.

        Raises:
            TimeoutException: The target is still running after ``timeout``.
            Exception: Whatever exception the target raised.
        """
        super().join(timeout)
        if self.is_alive():
            raise TimeoutException("处理超时")
        if self.exception is not None:
            raise self.exception
        return self.result
def _parse_model_reply(reply):
    """Split the model's markdown reply into ``(summary, keywords)``.

    The reply is expected to contain a ``### 摘要`` section followed by a
    ``### 关键词`` section whose entries are ``- `` bullet lines. A missing
    heading raises IndexError, which the caller reports as a failure.
    """
    summary = reply.split('### 摘要\n\n')[1].split('\n\n### 关键词')[0]
    keyword_section = reply.split('### 关键词\n\n')[1]
    # Read bullets line by line: splitting on '\n- ' and discarding the first
    # piece loses the first keyword whenever the section starts with a bullet.
    keywords = [
        line.strip()[2:].strip()
        for line in keyword_section.splitlines()
        if line.strip().startswith('- ')
    ]
    return summary, '、'.join(keywords)


def _summarize_with_model(title, content):
    """Ask the local model for a summary and keywords; return one result row."""
    res = ollama.chat(model='qwen2.5:14b', stream=False,
                      messages=[{"role": "user", "content": f"{content}总结成摘要和关键词"}],
                      options={"temperature": 0})
    # Printed before parsing so an unexpectedly formatted reply stays visible.
    print(res)
    summary, keywords = _parse_model_reply(res['message']['content'])
    return {"文件名": title, "摘要": summary, "关键词": keywords}


def process_folder(folder_path, output_folder, excel_path):
    """Summarise every PDF in a folder and save the results to an Excel file.

    Each ``*.pdf`` in ``folder_path`` is converted to a text file in
    ``output_folder``, sent to the model with a 30 second timeout, and the
    Excel file is rewritten after every file so partial progress is kept.

    Args:
        folder_path: Folder containing the PDF files.
        output_folder: Folder for the extracted text files (created if absent).
        excel_path: Path of the Excel file to write.

    Returns:
        A tuple ``(results, errors, unprocessed_files)``: the result rows, the
        PDFs that failed (extraction, timeout or model error), and the PDFs
        that were never attempted.
    """
    os.makedirs(output_folder, exist_ok=True)

    # Sorted so the processing order (and the progress output) is stable.
    pdf_files = sorted(glob.glob(os.path.join(folder_path, "*.pdf")))
    results = []
    total_files = len(pdf_files)
    processed_files = 0
    errors = []

    for pdf_file in pdf_files:
        # splitext only touches the real extension, unlike replace(".pdf", ...),
        # which also rewrites ".pdf" in the middle of a name.
        title = os.path.splitext(os.path.basename(pdf_file))[0]
        output_path = os.path.join(output_folder, title + ".txt")

        if process_pdf(pdf_file, output_path):
            with open(output_path, "r", encoding='utf-8') as file:
                content = file.read()
            try:
                # The worker returns its row and this thread appends it, so a
                # call that finishes after the timeout cannot add a late row
                # for a file already recorded as an error.
                timeout_thread = TimeoutThread(target=_summarize_with_model,
                                               args=(title, content))
                timeout_thread.start()
                results.append(timeout_thread.join(timeout=30))
            except TimeoutException:
                print(f"处理大模型时超时: {pdf_file}")
                errors.append(pdf_file)
            except Exception as e:
                print(f"处理大模型时发生错误: {pdf_file}, 错误信息: {e}")
                errors.append(pdf_file)
        else:
            errors.append(pdf_file)

        # Count every attempted file, failures included; otherwise the slice
        # below reports trailing files as unprocessed although they were tried.
        processed_files += 1
        print(f"进度: {processed_files}/{total_files} 文件已处理")

        # Save after each file so progress survives an interruption.
        write_to_excel(results, excel_path)

    # Files that were never attempted (empty when the loop ran to completion).
    unprocessed_files = pdf_files[processed_files:]

    return results, errors, unprocessed_files
def write_to_excel(results, excel_path):
    """Write the collected result rows to an Excel file.

    Args:
        results: List of dicts, one per processed PDF; the dict keys become
            the column headers.
        excel_path: Destination path of the Excel file (overwritten on each
            call).
    """
    df = pd.DataFrame(results)
    # index=False: do not emit the DataFrame's row index as an extra column.
    df.to_excel(excel_path, index=False)
import PyPDF2
import re
import ollama
import os
import glob
import pandas as pd
import threading
import time


# Strip special whitespace and characters outside printable ASCII.
def clean_text(text):
    """Reduce extracted PDF text to single-spaced printable ASCII.

    Every run of whitespace becomes one space, every character outside the
    printable ASCII range (0x20-0x7E) is removed, and the result is stripped.
    Note that this drops all non-ASCII text, including CJK characters.
    """
    # Normalise whitespace *before* dropping non-ASCII characters: newlines and
    # tabs are outside 0x20-0x7E, so deleting them first would glue the last
    # word of one line to the first word of the next.
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^\x20-\x7e]+', '', text)  # keep printable ASCII only
    # Removing characters can leave adjacent spaces behind; collapse them.
    return re.sub(r' {2,}', ' ', text).strip()


# Process a single PDF file.
def process_pdf(pdf_path, output_path):
    """Extract the text of one PDF and write it, one line per page, to a file.

    Pages with no extractable text are recorded with a placeholder line.
    Errors are reported on stdout instead of being raised.

    Returns:
        True if every page was processed, False if any error occurred.
    """
    try:
        with open(pdf_path, "rb") as file:
            reader = PyPDF2.PdfReader(file)
            with open(output_path, "w", encoding='utf-8') as output_file:
                for page in reader.pages:
                    text = page.extract_text()
                    if text:  # extraction produced something
                        output_file.write(clean_text(text) + "\n")
                    else:
                        output_file.write("未提取到有效文本\n")
    except FileNotFoundError:
        print(f"文件未找到: {pdf_path}")
        return False
    except PyPDF2.errors.PdfReadError:
        print(f"无法读取pdf文件: {pdf_path}")
        return False
    except Exception as e:
        # Boundary handler: one bad PDF must not abort a whole batch.
        print(f"处理pdf文件时发生错误: {pdf_path}, 错误信息: {e}")
        return False
    return True


# Exception raised when a TimeoutThread does not finish before its deadline.
class TimeoutException(Exception):
    """Signal that the target was still running when ``join`` timed out."""


# Thread subclass that reports its target's result, exception, or timeout.
class TimeoutThread(threading.Thread):
    """Run a callable in a thread and surface its outcome through ``join``.

    ``join`` returns the target's return value, re-raises the exception the
    target raised, or raises ``TimeoutException`` if the target is still
    running once the timeout has elapsed.
    """

    def __init__(self, target, args=(), kwargs=None):
        # Daemon thread: a target that is abandoned after a timeout must not
        # keep the interpreter alive at exit.
        super().__init__(daemon=True)
        self.target = target
        self.args = args
        # A fresh dict per instance; a `{}` default would be shared by all.
        self.kwargs = {} if kwargs is None else kwargs
        self.result = None
        self.exception = None

    def run(self):
        """Call the target, recording its return value or its exception."""
        try:
            self.result = self.target(*self.args, **self.kwargs)
        except Exception as e:
            # Stored so that join() can re-raise it in the waiting thread.
            self.exception = e

    def join(self, timeout=None):
        """Wait up to ``timeout`` seconds, then return, re-raise, or time out."""
        super().join(timeout)
        if self.is_alive():
            raise TimeoutException("处理超时")
        if self.exception is not None:
            raise self.exception
        return self.result


def _parse_model_reply(reply):
    """Split the model's markdown reply into ``(summary, keywords)``.

    The reply is expected to contain a ``### 摘要`` section followed by a
    ``### 关键词`` section whose entries are ``- `` bullet lines. A missing
    heading raises IndexError, which the caller reports as a failure.
    """
    summary = reply.split('### 摘要\n\n')[1].split('\n\n### 关键词')[0]
    keyword_section = reply.split('### 关键词\n\n')[1]
    # Read bullets line by line: splitting on '\n- ' and discarding the first
    # piece loses the first keyword whenever the section starts with a bullet.
    keywords = [
        line.strip()[2:].strip()
        for line in keyword_section.splitlines()
        if line.strip().startswith('- ')
    ]
    return summary, '、'.join(keywords)


def _summarize_with_model(title, content):
    """Ask the local model for a summary and keywords; return one result row."""
    res = ollama.chat(model='qwen2.5:14b', stream=False,
                      messages=[{"role": "user", "content": f"{content}总结成摘要和关键词"}],
                      options={"temperature": 0})
    # Printed before parsing so an unexpectedly formatted reply stays visible.
    print(res)
    summary, keywords = _parse_model_reply(res['message']['content'])
    return {"文件名": title, "摘要": summary, "关键词": keywords}


# Process every PDF file in a folder.
def process_folder(folder_path, output_folder, excel_path):
    """Summarise every PDF in a folder and save the results to an Excel file.

    Each ``*.pdf`` in ``folder_path`` is converted to a text file in
    ``output_folder``, sent to the model with a 30 second timeout, and the
    Excel file is rewritten after every file so partial progress is kept.

    Returns:
        A tuple ``(results, errors, unprocessed_files)``: the result rows, the
        PDFs that failed (extraction, timeout or model error), and the PDFs
        that were never attempted.
    """
    os.makedirs(output_folder, exist_ok=True)

    # Sorted so the processing order (and the progress output) is stable.
    pdf_files = sorted(glob.glob(os.path.join(folder_path, "*.pdf")))
    results = []
    total_files = len(pdf_files)
    processed_files = 0
    errors = []

    for pdf_file in pdf_files:
        # splitext only touches the real extension, unlike replace(".pdf", ...),
        # which also rewrites ".pdf" in the middle of a name.
        title = os.path.splitext(os.path.basename(pdf_file))[0]
        output_path = os.path.join(output_folder, title + ".txt")

        if process_pdf(pdf_file, output_path):
            with open(output_path, "r", encoding='utf-8') as file:
                content = file.read()
            try:
                # The worker returns its row and this thread appends it, so a
                # call that finishes after the timeout cannot add a late row
                # for a file already recorded as an error.
                timeout_thread = TimeoutThread(target=_summarize_with_model,
                                               args=(title, content))
                timeout_thread.start()
                results.append(timeout_thread.join(timeout=30))
            except TimeoutException:
                print(f"处理大模型时超时: {pdf_file}")
                errors.append(pdf_file)
            except Exception as e:
                print(f"处理大模型时发生错误: {pdf_file}, 错误信息: {e}")
                errors.append(pdf_file)
        else:
            errors.append(pdf_file)

        # Count every attempted file, failures included; otherwise the slice
        # below reports trailing files as unprocessed although they were tried.
        processed_files += 1
        print(f"进度: {processed_files}/{total_files} 文件已处理")

        # Save after each file so progress survives an interruption.
        write_to_excel(results, excel_path)

    # Files that were never attempted (empty when the loop ran to completion).
    unprocessed_files = pdf_files[processed_files:]

    return results, errors, unprocessed_files


# Write the results to an Excel file.
def write_to_excel(results, excel_path):
    """Write the result rows (a list of dicts) to the given Excel file."""
    df = pd.DataFrame(results)
    # index=False: do not emit the DataFrame's row index as an extra column.
    df.to_excel(excel_path, index=False)


def main():
    """Prompt for the three paths, run the batch, and print a final report."""
    a = input("pdf文件夹路径:")
    b = input("txt文件输出路径:")
    c = input("excel文件输出路径:")
    folder_path = a  # folder containing the PDFs
    output_folder = b  # folder for the extracted text files
    if c:
        # The Excel writer does not create missing directories itself.
        os.makedirs(c, exist_ok=True)
    # os.path.join instead of a hard-coded backslash so the path is also
    # valid outside Windows.
    excel_path = os.path.join(c, "results.xlsx")

    results, errors, unprocessed_files = process_folder(folder_path, output_folder, excel_path)

    print(f"所有pdf文件已处理完毕,结果已保存到 {excel_path}")
    if errors:
        print("以下pdf文件处理失败:")
        for error in errors:
            print(error)
    if unprocessed_files:
        print("以下pdf文件未处理:")
        for unprocessed in unprocessed_files:
            print(unprocessed)


# Script entry point.
if __name__ == "__main__":
    main()
以上就是python调用ollama本地大模型进行批量识别pdf的详细内容,更多关于python ollama识别pdf的资料请关注代码网其它相关文章!
版权声明:本文内容由互联网用户贡献,该文观点仅代表作者本人。本站仅提供信息存储服务,不拥有所有权,不承担相关法律责任。 如发现本站有涉嫌抄袭侵权/违法违规的内容, 请发送邮件至 2386932994@qq.com 举报,一经查实将立刻删除。