The following is a comprehensive guide to the transformers library, covering fundamentals, advanced usage, example code, and a suggested learning path. The material is organized to suit learners at different stages.
- tokenizer: tokenizes and encodes text
- model: the neural network model architectures
- pipeline: a high-level wrapper for quick inference

Install the core packages:

pip install transformers torch datasets
from transformers import pipeline

# Sentiment-analysis pipeline
classifier = pipeline("sentiment-analysis")
result = classifier("I love programming with Transformers!")
print(result)
# [{'label': 'POSITIVE', 'score': 0.9998}]
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
text = "Hello, world!"
encoded = tokenizer(text, padding=True, truncation=True, return_tensors="pt")  # return PyTorch tensors
print(encoded)
# {'input_ids': tensor([[101, 7592, 1010, 2088, 999, 102]]),
#  'attention_mask': tensor([[1, 1, 1, 1, 1, 1]])}
from transformers import AutoModel

model = AutoModel.from_pretrained("bert-base-uncased")
outputs = model(**encoded)  # forward pass
last_hidden_states = outputs.last_hidden_state
from transformers import BertForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset

# Load the dataset
dataset = load_dataset("imdb")
tokenized_datasets = dataset.map(
    lambda x: tokenizer(x["text"], padding=True, truncation=True),
    batched=True
)

# Define the model
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    evaluation_strategy="epoch"
)

# Trainer setup
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"]
)

# Start training
trainer.train()
model.save_pretrained("./my_model")
tokenizer.save_pretrained("./my_model")

# Load the saved model
new_model = AutoModel.from_pretrained("./my_model")
from transformers import BertModel, BertTokenizer
import torch

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased", output_attentions=True)
inputs = tokenizer("The cat sat on the mat", return_tensors="pt")
outputs = model(**inputs)

# Attention weights of layer 0 for the first sample
attention = outputs.attentions[0][0]
print(attention.shape)  # [num_heads, seq_len, seq_len]
from transformers import TrainingArguments

training_args = TrainingArguments(
    fp16=True,  # enable mixed precision
    # ... other arguments
)
from transformers import pipeline

# Load a NER pipeline
ner_pipeline = pipeline("ner", model="dslim/bert-base-NER")
text = "Apple was founded by Steve Jobs in Cupertino."
results = ner_pipeline(text)

# Print the recognized entities
for entity in results:
    print(f"{entity['word']} -> {entity['entity']} (confidence: {entity['score']:.2f})")
- Beginner stage: pipeline and basic model usage
- Intermediate stage:
- Advanced stage:
- Must-read papers:
- Practice projects:
- Community resources:
Dynamically adjust the learning rate during training and prevent exploding gradients:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    weight_decay=0.01,
    warmup_steps=500,               # learning-rate warmup steps
    gradient_accumulation_steps=2,  # gradient accumulation (saves GPU memory)
    max_grad_norm=1.0,              # gradient-clipping threshold
    # ... other arguments
)
import torch
from transformers import BertForSequenceClassification

class CustomModel(BertForSequenceClassification):
    def __init__(self, config):
        super().__init__(config)

    def forward(self, input_ids, attention_mask, labels=None):
        outputs = super().forward(input_ids, attention_mask)
        logits = outputs.logits
        if labels is not None:
            # Class-weighted loss for imbalanced data
            loss_fct = torch.nn.CrossEntropyLoss(weight=torch.tensor([1.0, 2.0]))
            loss = loss_fct(logits.view(-1, 2), labels.view(-1))
            return {"loss": loss, "logits": logits}
        return outputs
from transformers import GPT2LMHeadModel, GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")

prompt = "In a world where AI dominates,"
input_ids = tokenizer.encode(prompt, return_tensors="pt")

# Generate text (sampling parameters)
output = model.generate(
    input_ids,
    max_length=100,
    do_sample=True,          # required for temperature/top_k to take effect
    temperature=0.7,         # controls randomness (lower = more deterministic)
    top_k=50,                # restrict the candidate-token pool
    num_return_sequences=3   # generate 3 different completions
)

for seq in output:
    print(tokenizer.decode(seq, skip_special_tokens=True))
from transformers import pipeline

qa_pipeline = pipeline("question-answering", model="deepset/roberta-base-squad2")
context = """
Hugging Face is a company based in New York City. Its Transformers library
is widely used in NLP.
"""
question = "Where is Hugging Face located?"
result = qa_pipeline(question=question, context=context)
print(f"Answer: {result['answer']} (score: {result['score']:.2f})")
# Answer: New York City (score: 0.92)
from transformers import BertModel
import torch

model = BertModel.from_pretrained("bert-base-uncased")
quantized_model = torch.quantization.quantize_dynamic(
    model,
    {torch.nn.Linear},  # quantize all linear layers
    dtype=torch.qint8
)
# Dynamic quantization typically speeds up inference 2-4x and shrinks the model by ~75%
from transformers import BertTokenizer, BertForSequenceClassification
from torch.onnx import export

model = BertForSequenceClassification.from_pretrained("bert-base-uncased")
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Example input
dummy_input = tokenizer("This is a test", return_tensors="pt")

# Export to ONNX
export(
    model,
    (dummy_input["input_ids"], dummy_input["attention_mask"]),
    "model.onnx",
    opset_version=13,
    input_names=["input_ids", "attention_mask"],
    output_names=["logits"],
    dynamic_axes={"input_ids": {0: "batch"}, "attention_mask": {0: "batch"}}
)
import torch

# Insert GPU memory monitoring into the training loop
print(f"Allocated: {torch.cuda.memory_allocated() / 1e9:.2f} GB")
print(f"Cached:    {torch.cuda.memory_reserved() / 1e9:.2f} GB")
from torch.profiler import profile, record_function, ProfilerActivity

with profile(activities=[ProfilerActivity.CUDA], record_shapes=True) as prof:
    outputs = model(**inputs)
print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast

model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")
tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")

# Chinese -> English
tokenizer.src_lang = "zh_CN"
text = "欢迎使用Transformers库"
encoded = tokenizer(text, return_tensors="pt")
generated_tokens = model.generate(**encoded, forced_bos_token_id=tokenizer.lang_code_to_id["en_XX"])
print(tokenizer.batch_decode(generated_tokens, skip_special_tokens=True))
# ['Welcome to the Transformers library']
from PIL import Image
from transformers import CLIPProcessor, CLIPModel

model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

image = Image.open("cat.jpg")
text = ["a photo of a cat", "a photo of a dog"]
inputs = processor(text=text, images=image, return_tensors="pt", padding=True)
outputs = model(**inputs)

# Image-text similarity
logits_per_image = outputs.logits_per_image
probs = logits_per_image.softmax(dim=1)  # probability distribution over the captions
Implement a simplified Transformer block:
import torch.nn as nn

class TransformerBlock(nn.Module):
    def __init__(self, d_model=512, nhead=8):
        super().__init__()
        self.attention = nn.MultiheadAttention(d_model, nhead)
        self.linear = nn.Linear(d_model, d_model)
        self.norm = nn.LayerNorm(d_model)

    def forward(self, x):
        attn_output, _ = self.attention(x, x, x)
        x = x + attn_output     # residual connection
        x = self.norm(x)
        x = x + self.linear(x)  # feed-forward with residual
        return x
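A quick sanity check of the block above; note that nn.MultiheadAttention defaults to a [seq_len, batch, d_model] input layout:

import torch

block = TransformerBlock(d_model=512, nhead=8)
x = torch.randn(10, 2, 512)  # [seq_len, batch, d_model]
out = block(x)
print(out.shape)             # torch.Size([10, 2, 512])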
Solutions for GPU out-of-memory errors (a combined configuration sketch follows this list):

- Reduce batch_size
- Increase gradient_accumulation_steps (trade extra steps for lower memory via gradient accumulation)
- Enable fp16=True (mixed precision)
- Call torch.cuda.empty_cache() to release cached GPU memory
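A minimal sketch combining these memory-saving settings (the values are illustrative, not prescriptive):

import torch
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=4,  # smaller batch to fit in memory
    gradient_accumulation_steps=8,  # effective batch size = 4 * 8 = 32
    fp16=True,                      # mixed precision roughly halves activation memory
)

# Between experiments, release cached GPU memory
torch.cuda.empty_cache()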
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")

# Manually add special vocabulary
tokenizer.add_tokens(["【特殊词】"])

# Resize the model's embedding layer to match the new vocabulary size
model.resize_token_embeddings(len(tokenizer))
The following extends the deep dive into the transformers library, covering more real-world scenarios, cutting-edge techniques, and industrial-grade practices.
from transformers import LlamaForCausalLM, LlamaTokenizer, TrainingArguments

# Load the model and tokenizer (access approval required)
model = LlamaForCausalLM.from_pretrained("decapoda-research/llama-7b-hf")
tokenizer = LlamaTokenizer.from_pretrained("decapoda-research/llama-7b-hf")

# Low-Rank Adaptation (LoRA) fine-tuning
from peft import get_peft_model, LoraConfig

lora_config = LoraConfig(
    r=8,                                  # low-rank dimension
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],  # only adapt selected modules
    lora_dropout=0.05,
    bias="none"
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()  # shows the trainable-parameter ratio (usually <1%)

# Continue with training-argument setup...
# RLHF training with the TRL library
from trl import PPOTrainer, AutoModelForCausalLMWithValueHead

model = AutoModelForCausalLMWithValueHead.from_pretrained("gpt2")
ppo_trainer = PPOTrainer(
    model=model,
    config=training_args,
    dataset=dataset,
    tokenizer=tokenizer
)

# Reward loop
for epoch in range(3):
    for batch in ppo_trainer.dataloader:
        # Generate responses
        response_tensors = model.generate(batch["input_ids"])
        # Compute rewards (requires a custom reward function)
        rewards = calculate_rewards(response_tensors, batch)
        # PPO optimization step: (queries, responses, scalar rewards)
        ppo_trainer.step(batch["input_ids"], response_tensors, rewards)
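One way to flesh out calculate_rewards, which is not part of TRL: score each generated text with a ready-made sentiment classifier standing in for a trained reward model (the model name and reward scheme below are assumptions for illustration):

import torch
from transformers import pipeline

# Stand-in reward model: an off-the-shelf sentiment classifier
reward_pipe = pipeline("sentiment-analysis",
                       model="distilbert-base-uncased-finetuned-sst-2-english")

def calculate_rewards(response_tensors, batch):
    texts = [tokenizer.decode(r, skip_special_tokens=True) for r in response_tensors]
    outputs = reward_pipe(texts)
    # One scalar reward per response: the positive-sentiment probability
    return [torch.tensor(o["score"] if o["label"] == "POSITIVE" else 1.0 - o["score"])
            for o in outputs]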
from transformers import TrainingArguments

# Distributed-training configuration
training_args = TrainingArguments(
    per_device_train_batch_size=4,
    gradient_accumulation_steps=8,
    fp16=True,
    tpu_num_cores=8,                 # core count when training on TPU
    dataloader_num_workers=4,
    deepspeed="./configs/deepspeed_config.json"  # DeepSpeed optimization
)

Example DeepSpeed config file (deepspeed_config.json):

{
    "fp16": { "enabled": true },
    "optimizer": {
        "type": "AdamW",
        "params": { "lr": 3e-5 }
    },
    "zero_optimization": {
        "stage": 3
    }
}

Setting "stage": 3 enables ZeRO-3 optimization.
from fastapi import FastAPI
from pydantic import BaseModel
from transformers import pipeline

app = FastAPI()
generator = pipeline("text-generation", model="gpt2")

class Request(BaseModel):
    text: str
    max_length: int = 100

@app.post("/generate")
async def generate_text(request: Request):
    result = generator(request.text, max_length=request.max_length)
    return {"generated_text": result[0]["generated_text"]}

# Start the service: uvicorn main:app --port 8000
import torch
from transformers import AutoTokenizer, AutoModelForQuestionAnswering

tokenizer = AutoTokenizer.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")
model = AutoModelForQuestionAnswering.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")

def process_long_text(context, question, max_length=384, stride=128):
    # Split the long document into overlapping chunks
    inputs = tokenizer(
        question,
        context,
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True
    )
    # Run inference per chunk and keep the best-scoring answer
    best_score = float("-inf")
    best_answer = ""
    for i in range(len(inputs["input_ids"])):
        chunk = {
            "input_ids": torch.tensor([inputs["input_ids"][i]]),
            "attention_mask": torch.tensor([inputs["attention_mask"][i]]),
        }
        outputs = model(**chunk)
        answer_start = torch.argmax(outputs.start_logits)
        answer_end = torch.argmax(outputs.end_logits) + 1
        score = (outputs.start_logits[0, answer_start] + outputs.end_logits[0, answer_end - 1]).item()
        if score > best_score:
            best_score = score
            best_answer = tokenizer.decode(inputs["input_ids"][i][answer_start:answer_end])
    return best_answer
# Cross-lingual transfer with XLM-RoBERTa
from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification

tokenizer = XLMRobertaTokenizer.from_pretrained("xlm-roberta-base")
model = XLMRobertaForSequenceClassification.from_pretrained("xlm-roberta-base")

# Fine-tune with a small number of samples (training code mirrors the BERT example)
import torch
import matplotlib.pyplot as plt
from captum.attr import LayerIntegratedGradients
from transformers import BertForSequenceClassification, BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForSequenceClassification.from_pretrained("bert-base-uncased")

def forward_func(input_ids, attention_mask):
    return model(input_ids, attention_mask).logits

lig = LayerIntegratedGradients(forward_func, model.bert.embeddings)

# Encode a sample input
encoding = tokenizer("This movie was great", return_tensors="pt")
input_ids = encoding["input_ids"]
attention_mask = encoding["attention_mask"]
tokens = tokenizer.convert_ids_to_tokens(input_ids[0])

# Compute token-importance attributions
attributions, delta = lig.attribute(
    inputs=input_ids,
    baselines=tokenizer.pad_token_id * torch.ones_like(input_ids),
    target=1,  # attribute w.r.t. the positive class
    additional_forward_args=attention_mask,
    return_convergence_delta=True
)

# Visualize (sum the per-dimension attributions for each token)
plt.bar(range(len(tokens)), attributions.sum(dim=-1)[0].detach().numpy())
plt.xticks(ticks=range(len(tokens)), labels=tokens, rotation=90)
plt.show()
import spacy
from spacy_transformers import TransformersLanguage, TransformersWordPiecer

# Create a spaCy pipeline backed by a transformer (legacy spacy-transformers 0.x API)
nlp = TransformersLanguage(trf_name="bert-base-uncased")

# Custom component
@spacy.registry.architectures("CustomClassifier.v1")
def create_classifier(transformer, tok2vec, n_classes):
    return TransformersTextCategorizer(transformer, tok2vec, n_classes)

# Use the transformer model directly inside spaCy
doc = nlp("This is a text to analyze.")
print(doc._.trf_last_hidden_state.shape)  # [seq_len, hidden_dim]
import gradio as gr
from transformers import pipeline

ner_pipeline = pipeline("ner")

def extract_entities(text):
    results = ner_pipeline(text)
    return {"text": text, "entities": [
        {"entity": res["entity"], "start": res["start"], "end": res["end"]}
        for res in results
    ]}

gr.Interface(
    fn=extract_entities,
    inputs=gr.Textbox(lines=5),
    outputs=gr.HighlightedText()
).launch()
- Track the latest developments:
- Advanced hands-on projects:
- System optimization directions:
The following concludes with an advanced practice guide for the transformers library, covering production-grade optimization, cutting-edge model architectures, domain-specific solutions, and ethical considerations.
# Structured pruning with the nn_pruning library
from transformers import BertForSequenceClassification
from nn_pruning import ModelPruning

model = BertForSequenceClassification.from_pretrained("bert-base-uncased")
pruner = ModelPruning(
    model,
    target_sparsity=0.5,    # prune 50% of the attention heads
    pattern="block_sparse"  # structured pruning pattern
)

# Prune, then fine-tune
pruned_model = pruner.prune()
pruned_model.save_pretrained("./pruned_bert")

# Knowledge distillation (teacher -> student)
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer

teacher = BertForSequenceClassification.from_pretrained("bert-base-uncased")
student = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")

# Distillation trainer (note: DistillationTrainingArguments / DistillationTrainer are
# not shipped with transformers; they stand for a custom Trainer subclass, sketched below)
training_args = DistillationTrainingArguments(
    output_dir="./distilled",
    temperature=2.0,  # soften the probability distributions
    alpha_ce=0.5,     # cross-entropy loss weight
    alpha_mse=0.5     # hidden-state MSE loss weight
)
trainer = DistillationTrainer(
    teacher=teacher,
    student=student,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    tokenizer=tokenizer
)
trainer.train()
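A minimal sketch of what such a distillation trainer could look like, assuming the teacher and student accept the same inputs; SimpleDistillationTrainer is a hypothetical helper, not a transformers API:

import torch
import torch.nn.functional as F
from transformers import Trainer

class SimpleDistillationTrainer(Trainer):
    """Hypothetical distillation trainer: student CE loss + temperature-scaled KL loss."""
    def __init__(self, teacher=None, temperature=2.0, alpha_ce=0.5, **kwargs):
        super().__init__(**kwargs)
        self.teacher = teacher.eval()
        self.temperature = temperature
        self.alpha_ce = alpha_ce

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        outputs = model(**inputs)  # student forward pass (CE loss comes from labels)
        with torch.no_grad():
            teacher_logits = self.teacher(**inputs).logits
        t = self.temperature
        # KL divergence between softened student and teacher distributions
        kd_loss = F.kl_div(
            F.log_softmax(outputs.logits / t, dim=-1),
            F.softmax(teacher_logits / t, dim=-1),
            reduction="batchmean",
        ) * (t ** 2)
        loss = self.alpha_ce * outputs.loss + (1 - self.alpha_ce) * kd_loss
        return (loss, outputs) if return_outputs else loss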
# Convert the ONNX model to a TensorRT engine
trtexec --onnx=model.onnx --saveEngine=model.trt --fp16
# Run the TensorRT engine from Python
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit  # initializes the CUDA context

runtime = trt.Runtime(trt.Logger(trt.Logger.WARNING))
with open("model.trt", "rb") as f:
    engine = runtime.deserialize_cuda_engine(f.read())
context = engine.create_execution_context()
# Bind input/output buffers and run inference (see the sketch below)
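A rough sketch of the buffer-binding step. TensorRT APIs vary across versions, and the shapes, dtypes, and binding order below are assumptions based on the ONNX export above:

import numpy as np

# Host-side inputs matching the exported "input_ids"/"attention_mask" (assumed shapes)
input_ids = np.ones((1, 128), dtype=np.int32)
attention_mask = np.ones((1, 128), dtype=np.int32)
logits = np.empty((1, 2), dtype=np.float32)  # assumed output shape

# Allocate device buffers and copy the inputs over
d_ids = cuda.mem_alloc(input_ids.nbytes)
d_mask = cuda.mem_alloc(attention_mask.nbytes)
d_out = cuda.mem_alloc(logits.nbytes)
cuda.memcpy_htod(d_ids, input_ids)
cuda.memcpy_htod(d_mask, attention_mask)

# Execute with bindings ordered as in the engine, then copy the result back
context.execute_v2(bindings=[int(d_ids), int(d_mask), int(d_out)])
cuda.memcpy_dtoh(logits, d_out)
print(logits)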
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification

tokenizer = AutoTokenizer.from_pretrained("dmis-lab/biobert-v1.1")
model = AutoModelForTokenClassification.from_pretrained("dmis-lab/biobert-v1.1")

text = "The patient exhibited EGFR mutations and responded to osimertinib."
inputs = tokenizer(text, return_tensors="pt")
outputs = model(**inputs).logits

# Extract gene/drug entities
predictions = torch.argmax(outputs, dim=2)
print([tokenizer.decode([token]) for token in inputs.input_ids[0]])
print(predictions.tolist())  # BIO tagging results
# Contract-clause classification
import torch
from transformers import BertTokenizer, BertForSequenceClassification

tokenizer = BertTokenizer.from_pretrained("nlpaueb/legal-bert-base-uncased")
model = BertForSequenceClassification.from_pretrained("nlpaueb/legal-bert-base-uncased")

clause = "The parties hereby agree to arbitrate all disputes in accordance with ICC rules."
inputs = tokenizer(clause, return_tensors="pt", truncation=True, padding=True)
outputs = model(**inputs)
predicted_class = torch.argmax(outputs.logits).item()  # e.g. 0: arbitration clause, 1: confidentiality clause
import torch
import coremltools as ct
from transformers import BertForSequenceClassification, BertTokenizer

# torchscript=True makes the model traceable (tuple outputs instead of dicts)
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", torchscript=True)
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Trace the model with example inputs
example = tokenizer("This is a test", return_tensors="pt")
input_ids = example["input_ids"]
attention_mask = example["attention_mask"]
traced_model = torch.jit.trace(model, (input_ids, attention_mask))

# Convert to Core ML
mlmodel = ct.convert(
    traced_model,
    inputs=[
        ct.TensorType(name="input_ids", shape=input_ids.shape),
        ct.TensorType(name="attention_mask", shape=attention_mask.shape)
    ]
)
mlmodel.save("BertSenti.mlmodel")
import tensorflow as tf
from transformers import TFBertForSequenceClassification

model = TFBertForSequenceClassification.from_pretrained("bert-base-uncased")

# Convert to TFLite with dynamic-range quantization
converter = tf.lite.TFLiteConverter.from_keras_model(model)
converter.optimizations = [tf.lite.Optimize.DEFAULT]
tflite_model = converter.convert()

with open("model_quant.tflite", "wb") as f:
    f.write(tflite_model)
from transformers import pipeline
from fairness_metrics import demographic_parity  # placeholder module, not a published package

# Probe the model for bias
classifier = pipeline("text-classification", model="bert-base-uncased")

protected_groups = {
    "gender": ["she", "he"],
    "race": ["African", "European"]
}

bias_scores = {}
for category, terms in protected_groups.items():
    texts = [f"{term} is qualified for this position" for term in terms]
    results = classifier(texts)
    bias_scores[category] = demographic_parity(results)
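Since fairness_metrics is a placeholder, a minimal demographic_parity could be written by hand as the gap in positive-prediction rates between the probed groups (the label name is an assumption; adjust it to the classifier's actual labels):

def demographic_parity(results, positive_label="LABEL_1"):
    """Gap between the highest and lowest positive-prediction rate across probes.
    0.0 means the classifier treats the probed terms identically."""
    rates = [1.0 if r["label"] == positive_label else 0.0 for r in results]
    return max(rates) - min(rates)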
import textattack
from textattack.attack_recipes import BAEGarg2019
from textattack.models.wrappers import HuggingFaceModelWrapper

model_wrapper = HuggingFaceModelWrapper(model, tokenizer)
attack = BAEGarg2019.build(model_wrapper)  # BAE attack recipe

# Generate adversarial examples
attack_args = textattack.AttackArgs(num_examples=5)
attacker = textattack.Attacker(attack, dataset, attack_args)
attack_results = attacker.attack_dataset()
from transformers import LongformerModel, LongformerTokenizer

tokenizer = LongformerTokenizer.from_pretrained("allenai/longformer-base-4096")
model = LongformerModel.from_pretrained("allenai/longformer-base-4096")

inputs = tokenizer("This is a very long document..." * 1000,
                   return_tensors="pt", truncation=True, max_length=4096)
outputs = model(**inputs)  # handles sequences up to 4096 tokens
# Mixture-of-experts with Switch Transformers
from transformers import SwitchTransformersForConditionalGeneration, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("google/switch-base-8")
model = SwitchTransformersForConditionalGeneration.from_pretrained("google/switch-base-8")

input_ids = tokenizer("Translate to German: hello world", return_tensors="pt").input_ids
# Note: the expert-routing kwargs below follow the original article and may not
# match the current transformers generate() API
outputs = model.generate(
    input_ids,
    expert_choice_mask=True,  # track expert routing
)
print(outputs.expert_choices)  # which expert each token was routed to
""" 端到端文本分类系统架构: 1. 数据采集 → 2. 清洗 → 3. 标注 → 4. 模型训练 → 5. 评估 → 6. 部署 → 7. 监控 """ # 步骤4的增强训练流程 from transformers import trainercallback class customcallback(trainercallback): def on_log(self, args, state, control, logs=none, **kwargs): # 实时记录指标到prometheus prometheus_logger.log_metrics(logs) # 步骤7的漂移检测 from alibi_detect.cd import mmddrift detector = mmddrift( x_train, backend="tensorflow", p_val=0.05 ) drift_preds = detector.predict(x_prod)
- Technology tracking:
- Skill expansion:
- Cross-domain integration:
- Ethics in practice:
This concludes the comprehensive guide to the Python transformers library for NLP.