94人参与 • 2025-04-25 • Python
以下是一份关于 transformers 库的全面讲解,包含基础知识、高级用法、案例代码及学习路径。内容经过组织,适合不同阶段的学习者。
tokenizer:文本分词与编码
model:神经网络模型架构
pipeline:快速推理的封装接口
pip install transformers torch datasets
from transformers import pipeline
# 使用情感分析流水线
classifier = pipeline("sentiment-analysis")
result = classifier("i love programming with transformers!")
print(result) # [{'label': 'positive', 'score': 0.9998}]from transformers import autotokenizer
tokenizer = autotokenizer.from_pretrained("bert-base-uncased")
text = "hello, world!"
encoded = tokenizer(text,
padding=true,
truncation=true,
return_tensors="pt") # 返回pytorch张量
print(encoded)
# {'input_ids': tensor([[101, 7592, 1010, 2088, 999, 102]]),
# 'attention_mask': tensor([[1, 1, 1, 1, 1, 1]])}from transformers import automodel
model = automodel.from_pretrained("bert-base-uncased")
outputs = model(**encoded) # 前向传播
last_hidden_states = outputs.last_hidden_statefrom transformers import bertforsequenceclassification, trainer, trainingarguments
from datasets import load_dataset
# 加载数据集
dataset = load_dataset("imdb")
tokenized_datasets = dataset.map(
lambda x: tokenizer(x["text"], padding=true, truncation=true),
batched=true
)
# 定义模型
model = bertforsequenceclassification.from_pretrained("bert-base-uncased", num_labels=2)
# 训练参数配置
training_args = trainingarguments(
output_dir="./results",
num_train_epochs=3,
per_device_train_batch_size=8,
evaluation_strategy="epoch"
)
# 训练器配置
trainer = trainer(
model=model,
args=training_args,
train_dataset=tokenized_datasets["train"],
eval_dataset=tokenized_datasets["test"]
)
# 开始训练
trainer.train()model.save_pretrained("./my_model")
tokenizer.save_pretrained("./my_model")
# 加载自定义模型
new_model = automodel.from_pretrained("./my_model")from transformers import bertmodel, berttokenizer
import torch
model = bertmodel.from_pretrained("bert-base-uncased", output_attentions=true)
inputs = tokenizer("the cat sat on the mat", return_tensors="pt")
outputs = model(**inputs)
# 提取第0层的注意力权重
attention = outputs.attentions[0][0]
print(attention.shape) # [num_heads, seq_len, seq_len]from transformers import trainingarguments
training_args = trainingarguments(
fp16=true, # 启用混合精度
...
)from transformers import pipeline
# 加载ner流水线
ner_pipeline = pipeline("ner", model="dslim/bert-base-ner")
text = "apple was founded by steve jobs in cupertino."
results = ner_pipeline(text)
# 结果可视化
for entity in results:
print(f"{entity['word']} -> {entity['entity']} (confidence: {entity['score']:.2f})")
入门阶段:
pipeline 和基础模型使用
中级阶段:
高级阶段:
必读论文:
实践项目:
社区资源:
在训练过程中动态调整学习率,防止梯度爆炸:
from transformers import TrainingArguments

# Training configuration that combines learning-rate warmup, gradient
# accumulation and gradient clipping to keep optimisation stable.
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    weight_decay=0.01,
    warmup_steps=500,  # learning-rate warmup steps
    gradient_accumulation_steps=2,  # gradient accumulation (saves GPU memory)
    # TrainingArguments has no `gradient_clipping` keyword; the clipping
    # threshold is configured through `max_grad_norm`.
    max_grad_norm=1.0,
    # ... remaining arguments omitted (a bare `...` after keyword arguments
    # would be a SyntaxError)
)
import torch
from transformers import BertForSequenceClassification


class custommodel(BertForSequenceClassification):
    """BERT sequence classifier trained with a class-weighted cross-entropy loss.

    When ``labels`` are supplied, the loss is computed here with per-class
    weights ``[1.0, 2.0]`` instead of the unweighted loss of the parent class.
    """

    def __init__(self, config):
        super().__init__(config)

    def forward(self, input_ids, attention_mask, labels=None):
        """Run the parent forward pass and optionally compute the weighted loss.

        Args:
            input_ids: Token id tensor forwarded to the parent model.
            attention_mask: Attention mask tensor forwarded to the parent model.
            labels: Optional class-index tensor; when given, a weighted loss
                is returned alongside the logits.

        Returns:
            ``{"loss": ..., "logits": ...}`` when ``labels`` is given,
            otherwise the unmodified output object of the parent model.
        """
        outputs = super().forward(input_ids, attention_mask)
        logits = outputs.logits
        if labels is not None:
            # Build the weights on the logits' device; a CPU weight tensor
            # fails as soon as the model runs on a GPU.
            class_weights = torch.tensor([1.0, 2.0], device=logits.device)
            loss_fct = torch.nn.CrossEntropyLoss(weight=class_weights)  # class weights
            loss = loss_fct(logits.view(-1, 2), labels.view(-1))
            return {"loss": loss, "logits": logits}
        return outputs
from transformers import gpt2lmheadmodel, gpt2tokenizer
tokenizer = gpt2tokenizer.from_pretrained("gpt2")
model = gpt2lmheadmodel.from_pretrained("gpt2")
prompt = "in a world where ai dominates,"
input_ids = tokenizer.encode(prompt, return_tensors="pt")
# 生成文本(配置生成参数)
output = model.generate(
input_ids,
max_length=100,
temperature=0.7, # 控制随机性(低值更确定)
top_k=50, # 限制候选词数量
num_return_sequences=3 # 生成3个不同结果
)
for seq in output:
print(tokenizer.decode(seq, skip_special_tokens=true))from transformers import pipeline
qa_pipeline = pipeline("question-answering", model="deepset/roberta-base-squad2")
context = """
hugging face is a company based in new york city.
its transformers library is widely used in nlp.
"""
question = "where is hugging face located?"
result = qa_pipeline(question=question, context=context)
print(f"answer: {result['answer']} (score: {result['score']:.2f})")
# answer: new york city (score: 0.92)from transformers import bertmodel, autotokenizer
import torch
model = bertmodel.from_pretrained("bert-base-uncased")
quantized_model = torch.quantization.quantize_dynamic(
model,
{torch.nn.linear}, # 量化所有线性层
dtype=torch.qint8
)
# 量化后推理速度提升2-4倍,模型体积减少约75%from transformers import berttokenizer, bertforsequenceclassification
from torch.onnx import export
model = bertforsequenceclassification.from_pretrained("bert-base-uncased")
tokenizer = berttokenizer.from_pretrained("bert-base-uncased")
# 示例输入
dummy_input = tokenizer("this is a test", return_tensors="pt")
# 导出为onnx
export(
model,
(dummy_input["input_ids"], dummy_input["attention_mask"]),
"model.onnx",
opset_version=13,
input_names=["input_ids", "attention_mask"],
output_names=["logits"],
dynamic_axes={"input_ids": {0: "batch"}, "attention_mask": {0: "batch"}}
)import torch
# 在训练循环中插入显存监控
print(f"allocated: {torch.cuda.memory_allocated() / 1e9:.2f} gb")
print(f"cached: {torch.cuda.memory_reserved() / 1e9:.2f} gb")from torch.profiler import profile, record_function, profileractivity
with profile(activities=[profileractivity.cuda], record_shapes=true) as prof:
outputs = model(**inputs)
print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))from transformers import mbartforconditionalgeneration, mbart50tokenizerfast
model = mbartforconditionalgeneration.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")
tokenizer = mbart50tokenizerfast.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")
# 中文转英文
tokenizer.src_lang = "zh_cn"
text = "欢迎使用transformers库"
encoded = tokenizer(text, return_tensors="pt")
generated_tokens = model.generate(**encoded, forced_bos_token_id=tokenizer.lang_code_to_id["en_xx"])
print(tokenizer.batch_decode(generated_tokens, skip_special_tokens=true))
# ['welcome to the transformers library']from pil import image
from transformers import clipprocessor, clipmodel
model = clipmodel.from_pretrained("openai/clip-vit-base-patch32")
processor = clipprocessor.from_pretrained("openai/clip-vit-base-patch32")
image = image.open("cat.jpg")
text = ["a photo of a cat", "a photo of a dog"]
inputs = processor(text=text, images=image, return_tensors="pt", padding=true)
outputs = model(**inputs)
# 计算图文相似度
logits_per_image = outputs.logits_per_image
probs = logits_per_image.softmax(dim=1) # 概率分布
实现一个简化版 transformer:
import torch.nn as nn
class transformerblock(nn.Module):
    """Simplified transformer block: self-attention plus a linear layer.

    Args:
        d_model: Embedding width of the input and output.
        nhead: Number of attention heads; must divide ``d_model``.
    """

    def __init__(self, d_model=512, nhead=8):
        super().__init__()
        self.attention = nn.MultiheadAttention(d_model, nhead)
        self.linear = nn.Linear(d_model, d_model)
        self.norm = nn.LayerNorm(d_model)

    def forward(self, x):
        """Apply self-attention and a linear layer, each with a residual add.

        ``x`` is used as query, key and value. The output has the same
        shape as ``x``.
        """
        attn_output, _ = self.attention(x, x, x)
        x = x + attn_output
        x = self.norm(x)
        x = x + self.linear(x)
        return x


# Solutions:
减小 batch_size
使用梯度累积(gradient_accumulation_steps)
启用混合精度(fp16=True)
清理缓存:torch.cuda.empty_cache()
from transformers import berttokenizer
tokenizer = berttokenizer.from_pretrained("bert-base-chinese")
# 手动添加特殊词汇
tokenizer.add_tokens(["【特殊词】"])
# 调整模型嵌入层
model.resize_token_embeddings(len(tokenizer))
以下继续扩展关于 transformers 库的深度应用内容,涵盖更多实际场景、前沿技术及工业级实践方案。
from transformers import llamaforcausallm, llamatokenizer, trainingarguments
# 加载模型和分词器(需申请权限)
model = llamaforcausallm.from_pretrained("decapoda-research/llama-7b-hf")
tokenizer = llamatokenizer.from_pretrained("decapoda-research/llama-7b-hf")
# 低秩适配(lora)微调
from peft import get_peft_model, loraconfig
lora_config = loraconfig(
r=8, # 低秩维度
lora_alpha=32,
target_modules=["q_proj", "v_proj"], # 仅微调部分模块
lora_dropout=0.05,
bias="none"
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters() # 显示可训练参数占比(通常 <1%)
# 继续配置训练参数...# 使用 trl 库进行 rlhf 训练
from trl import ppotrainer, automodelforcausallmwithvaluehead
model = automodelforcausallmwithvaluehead.from_pretrained("gpt2")
ppo_trainer = ppotrainer(
model=model,
config=training_args,
dataset=dataset,
tokenizer=tokenizer
)
# 定义奖励模型
for epoch in range(3):
for batch in ppo_trainer.dataloader:
# 生成响应
response_tensors = model.generate(batch["input_ids"])
# 计算奖励(需自定义奖励函数)
rewards = calculate_rewards(response_tensors, batch)
# ppo 优化步骤
ppo_trainer.step(
response_tensors,
rewards,
batch["attention_mask"]
)from transformers import trainingarguments
# 配置分布式训练
training_args = trainingarguments(
per_device_train_batch_size=4,
gradient_accumulation_steps=8,
fp16=true,
tpu_num_cores=8, # 使用tpu时指定核心数
dataloader_num_workers=4,
deepspeed="./configs/deepspeed_config.json" # 使用deepspeed优化
)
# deepspeed 配置文件示例(ds_config.json):
{
"fp16": {
"enabled": true
},
"optimizer": {
"type": "adamw",
"params": {
"lr": 3e-5
}
},
"zero_optimization": {
"stage": 3 # 启用zero-3优化
}
}from fastapi import fastapi
from pydantic import basemodel
from transformers import pipeline
app = fastapi()
generator = pipeline("text-generation", model="gpt2")
class request(basemodel):
text: str
max_length: int = 100
@app.post("/generate")
async def generate_text(request: request):
result = generator(request.text, max_length=request.max_length)
return {"generated_text": result[0]["generated_text"]}
# 启动服务:uvicorn main:app --port 8000from transformers import autotokenizer, automodelforquestionanswering
tokenizer = autotokenizer.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")
model = automodelforquestionanswering.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")
def process_long_text(context, question, max_length=384, stride=128):
    """Answer ``question`` over a ``context`` that may exceed the model window.

    The context is split into overlapping chunks by the tokenizer, each chunk
    is scored by the QA model, and the highest-scoring span is returned.

    Args:
        context: Text to search for the answer.
        question: Question to answer.
        max_length: Maximum number of tokens per chunk (question + context).
        stride: Number of context tokens shared between consecutive chunks.

    Returns:
        The decoded best-scoring answer span, or ``""`` if no chunk was
        produced.
    """
    # Split the long text into overlapping chunks
    inputs = tokenizer(
        question,
        context,
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
    )
    # Only these entries are model inputs. The tokenizer also returns
    # bookkeeping entries (offset_mapping, overflow_to_sample_mapping) that
    # the model's forward() does not accept.
    model_keys = ("input_ids", "attention_mask", "token_type_ids")

    # Run each chunk and keep the best span. Start from -inf because the
    # summed logits of the best span can legitimately be negative.
    best_score = float("-inf")
    best_answer = ""
    for i, chunk_ids in enumerate(inputs["input_ids"]):
        chunk = {k: torch.tensor([inputs[k][i]]) for k in model_keys if k in inputs}
        with torch.no_grad():  # inference only; no gradients needed
            outputs = model(**chunk)
        # Logits have a leading batch axis of size 1; drop it before indexing
        # by token position.
        start_logits = outputs.start_logits[0]
        end_logits = outputs.end_logits[0]
        answer_start = int(torch.argmax(start_logits))
        answer_end = int(torch.argmax(end_logits)) + 1
        score = (start_logits[answer_start] + end_logits[answer_end - 1]).item()
        if score > best_score:
            best_score = score
            best_answer = tokenizer.decode(chunk_ids[answer_start:answer_end])
    return best_answer
# Cross-lingual transfer with XLM-RoBERTa
from transformers import xlmrobertatokenizer, xlmrobertaforsequenceclassification
tokenizer = xlmrobertatokenizer.from_pretrained("xlm-roberta-base")
model = xlmrobertaforsequenceclassification.from_pretrained("xlm-roberta-base")
# 通过少量样本微调(代码与bert训练类似)from captum.attr import layerintegratedgradients
from transformers import bertforsequenceclassification
model = bertforsequenceclassification.from_pretrained("bert-base-uncased")
def forward_func(input_ids, attention_mask):
return model(input_ids, attention_mask).logits
lig = layerintegratedgradients(forward_func, model.bert.embeddings)
# 计算输入词重要性
attributions, delta = lig.attribute(
inputs=input_ids,
baselines=tokenizer.pad_token_id * torch.ones_like(input_ids),
additional_forward_args=attention_mask,
return_convergence_delta=true
)
# 可视化结果
import matplotlib.pyplot as plt
plt.bar(range(len(attributions[0])), attributions[0].detach().numpy())
plt.xticks(ticks=range(len(tokens)), labels=tokens, rotation=90)
plt.show()import spacy
from spacy_transformers import transformerslanguage, transformerswordpiecer
# 创建spacy管道
nlp = transformerslanguage(trf_name="bert-base-uncased")
# 自定义组件
@spacy.registry.architectures("customclassifier.v1")
def create_classifier(transformer, tok2vec, n_classes):
return transformerstextcategorizer(transformer, tok2vec, n_classes)
# 在spacy中直接使用transformer模型
doc = nlp("this is a text to analyze.")
print(doc._.trf_last_hidden_state.shape) # [seq_len, hidden_dim]import gradio as gr
from transformers import pipeline
ner_pipeline = pipeline("ner")
def extract_entities(text):
    """Run the NER pipeline on ``text`` and package the detected spans.

    Returns a dict holding the original ``text`` and an ``entities`` list,
    where each item carries the ``entity`` label and the ``start``/``end``
    offsets reported by the pipeline.
    """
    spans = []
    for hit in ner_pipeline(text):
        spans.append(
            {"entity": hit["entity"], "start": hit["start"], "end": hit["end"]}
        )
    return {"text": text, "entities": spans}
gr.interface(
fn=extract_entities,
inputs=gr.textbox(lines=5),
outputs=gr.highlightedtext()
).launch()
跟踪最新进展:
实战项目进阶:
系统优化方向:
以下继续扩展关于 transformers 库的终极实践指南,涵盖生产级优化、前沿模型架构、领域专用方案及伦理考量。
# 使用 nn_pruning 进行结构化剪枝
from transformers import bertforsequenceclassification
from nn_pruning import modelpruning
model = bertforsequenceclassification.from_pretrained("bert-base-uncased")
pruner = modelpruning(
model,
target_sparsity=0.5, # 剪枝50%的注意力头
pattern="block_sparse" # 结构化剪枝模式
)
# 执行剪枝并微调
pruned_model = pruner.prune()
pruned_model.save_pretrained("./pruned_bert")
# 知识蒸馏(教师→学生模型)
from transformers import distilbertforsequenceclassification, distilberttokenizer
teacher = bertforsequenceclassification.from_pretrained("bert-base-uncased")
student = distilbertforsequenceclassification.from_pretrained("distilbert-base-uncased")
# 使用蒸馏训练器
from transformers import distillationtrainingarguments, distillationtrainer
training_args = distillationtrainingarguments(
output_dir="./distilled",
temperature=2.0, # 软化概率分布
alpha_ce=0.5, # 交叉熵损失权重
alpha_mse=0.5 # 隐藏层mse损失权重
)
trainer = distillationtrainer(
teacher=teacher,
student=student,
args=training_args,
train_dataset=tokenized_datasets["train"],
tokenizer=tokenizer
)
trainer.train()# 转换模型为tensorrt引擎 trtexec --onnx=model.onnx --saveengine=model.trt --fp16
# python 调用tensorrt引擎
import tensorrt as trt
import pycuda.driver as cuda
runtime = trt.runtime(trt.logger(trt.logger.warning))
with open("model.trt", "rb") as f:
engine = runtime.deserialize_cuda_engine(f.read())
context = engine.create_execution_context()
# 绑定输入输出缓冲区进行推理from transformers import autotokenizer, automodelfortokenclassification
tokenizer = autotokenizer.from_pretrained("dmis-lab/biobert-v1.1")
model = automodelfortokenclassification.from_pretrained("dmis-lab/biobert-v1.1")
text = "the patient exhibited egfr mutations and responded to osimertinib."
inputs = tokenizer(text, return_tensors="pt")
outputs = model(**inputs).logits
# 提取基因实体
predictions = torch.argmax(outputs, dim=2)
print([tokenizer.decode([token]) for token in inputs.input_ids[0]])
print(predictions.tolist()) # bio标注结果# 合同条款分类
from transformers import berttokenizer, bertforsequenceclassification
tokenizer = berttokenizer.from_pretrained("nlpaueb/legal-bert-base-uncased")
model = bertforsequenceclassification.from_pretrained("nlpaueb/legal-bert-base-uncased")
clause = "the parties hereby agree to arbitrate all disputes in accordance with icc rules."
inputs = tokenizer(clause, return_tensors="pt", truncation=true, padding=true)
outputs = model(**inputs)
predicted_class = torch.argmax(outputs.logits).item() # 0: 仲裁条款, 1: 保密条款等from transformers import bertforsequenceclassification
import coremltools as ct
model = bertforsequenceclassification.from_pretrained("bert-base-uncased")
tokenizer = berttokenizer.from_pretrained("bert-base-uncased")
# 转换模型
traced_model = torch.jit.trace(model, (input_ids, attention_mask))
mlmodel = ct.convert(
traced_model,
inputs=[
ct.tensortype(name="input_ids", shape=input_ids.shape),
ct.tensortype(name="attention_mask", shape=attention_mask.shape)
]
)
mlmodel.save("bertsenti.mlmodel")from transformers import tfbertforsequenceclassification
import tensorflow as tf
model = tfbertforsequenceclassification.from_pretrained("bert-base-uncased")
# 转换为tflite
converter = tf.lite.tfliteconverter.from_keras_model(model)
converter.optimizations = [tf.lite.optimize.default] # 动态范围量化
tflite_model = converter.convert()
with open("model_quant.tflite", "wb") as f:
f.write(tflite_model)from transformers import pipeline
from fairness_metrics import demographic_parity
# 检测模型偏见
classifier = pipeline("text-classification", model="bert-base-uncased")
protected_groups = {
"gender": ["she", "he"],
"race": ["african", "european"]
}
bias_scores = {}
for category, terms in protected_groups.items():
texts = [f"{term} is qualified for this position" for term in terms]
results = classifier(texts)
bias_scores[category] = demographic_parity(results)from textattack import attackrecipe
from textattack.models.wrappers import huggingfacemodelwrapper
model_wrapper = huggingfacemodelwrapper(model, tokenizer)
attack = attackrecipe.build("bae") # bae攻击方法
# 生成对抗样本
attack_args = textattack.attackargs(num_examples=5)
attacker = textattack.attacker(attack, model_wrapper, attack_args)
attack_results = attacker.attack_dataset(dataset)from transformers import longformermodel
model = longformermodel.from_pretrained("allenai/longformer-base-4096")
inputs = tokenizer("this is a very long document..."*1000, return_tensors="pt")
outputs = model(**inputs) # 支持最长4096 tokens# 使用switch transformers
from transformers import switchtransformersforconditionalgeneration
model = switchtransformersforconditionalgeneration.from_pretrained("google/switch-base-8")
outputs = model.generate(
input_ids,
expert_choice_mask=true, # 追踪专家路由
)
print(outputs.expert_choices) # 显示每个token使用的专家"""
端到端文本分类系统架构:
1. 数据采集 → 2. 清洗 → 3. 标注 → 4. 模型训练 → 5. 评估 → 6. 部署 → 7. 监控
"""
# 步骤4的增强训练流程
from transformers import TrainerCallback


class customcallback(TrainerCallback):
    """Trainer callback that forwards logged metrics to Prometheus."""

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Push the ``logs`` dict to the Prometheus logger on every log event."""
        # NOTE(review): `prometheus_logger` is not defined in this snippet; it
        # is assumed to be supplied by the surrounding application.
        if logs is not None:  # `logs` defaults to None; nothing to record then
            # Record metrics to Prometheus in real time
            prometheus_logger.log_metrics(logs)
# 步骤7的漂移检测
from alibi_detect.cd import mmddrift
detector = mmddrift(
x_train,
backend="tensorflow",
p_val=0.05
)
drift_preds = detector.predict(x_prod)
技术跟踪:
技能扩展:
跨界融合:
伦理实践:
到此这篇关于 Python Transformers 库【NLP 处理库】全面讲解的文章就介绍到这了,更多相关 Python Transformers 库内容请搜索代码网以前的文章或继续浏览下面的相关文章,希望大家以后多多支持代码网!
您想发表意见!!点此发布评论
版权声明:本文内容由互联网用户贡献,该文观点仅代表作者本人。本站仅提供信息存储服务,不拥有所有权,不承担相关法律责任。 如发现本站有涉嫌抄袭侵权/违法违规的内容, 请发送邮件至 2386932994@qq.com 举报,一经查实将立刻删除。
发表评论