TokenClassificationChunkPipeline is throwing error: 'BatchEncoding' object is not an iterator
Question
Following this HuggingFace Anonymisation Tutorial.
Using PyTorch 2.0.0 and transformers 4.28.1.
Running the code as is, I get an error from the custom pipeline:
def anonymize(text):
    ents = pipe(text)  # this errors out
    ...
TypeError: 'BatchEncoding' object is not an iterator
I realise it's a tokenizer issue; this is the custom preprocess:
class TokenClassificationChunkPipeline(TokenClassificationPipeline):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def preprocess(self, sentence, offset_mapping=None):
        model_inputs = self.tokenizer(
            sentence,
            return_tensors="pt",
            truncation=True,
            return_special_tokens_mask=True,
            return_offsets_mapping=True,
            return_overflowing_tokens=True,  # Return multiple chunks
            max_length=self.tokenizer.model_max_length,
            padding=True
        )
        if offset_mapping:
            model_inputs["offset_mapping"] = offset_mapping
        model_inputs["sentence"] = sentence
        return model_inputs
This model_inputs is a <class 'transformers.tokenization_utils_base.BatchEncoding'>.
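Indeed, a BatchEncoding is dict-like: iterating over it yields its keys, but it has no __next__ method, so calling next() on it fails. A minimal repro outside the pipeline (same tokenizer, plain call):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Davlan/bert-base-multilingual-cased-ner-hrl")
enc = tokenizer("Bernard works at BNP Paribas in Paris.", return_tensors="pt")

print(type(enc))  # <class 'transformers.tokenization_utils_base.BatchEncoding'>
for key in enc:   # iteration works: BatchEncoding is dict-like and yields its keys
    print(key)    # input_ids, token_type_ids, attention_mask
next(enc)         # TypeError: 'BatchEncoding' object is not an iterator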
How can I make the BatchEncoding object an iterator?
Or is there another way?
For full code, please visit the tutorial link above.
Answer 1
Score: 2
Not sure why the pipeline was coded that way in the blogpost, but here's a working version:
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers.pipelines.token_classification import TokenClassificationPipeline

model_checkpoint = "Davlan/bert-base-multilingual-cased-ner-hrl"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint)


class TokenClassificationChunkPipeline(TokenClassificationPipeline):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def preprocess(self, sentence, offset_mapping=None, **preprocess_params):
        tokenizer_params = preprocess_params.pop("tokenizer_params", {})
        # (computed but unused here; truncation is hardcoded below)
        truncation = True if self.tokenizer.model_max_length and self.tokenizer.model_max_length > 0 else False
        inputs = self.tokenizer(
            sentence,
            return_tensors="pt",
            truncation=True,
            return_special_tokens_mask=True,
            return_offsets_mapping=True,
            return_overflowing_tokens=True,  # Return multiple chunks
            max_length=self.tokenizer.model_max_length,
            padding=True
        )
        # inputs.pop("overflow_to_sample_mapping", None)  # kept: _forward pops it
        num_chunks = len(inputs["input_ids"])

        # Yield one dict of tensors per chunk, so preprocess is a generator
        for i in range(num_chunks):
            if self.framework == "tf":
                # NOTE: requires `import tensorflow as tf` when using the TF framework
                model_inputs = {k: tf.expand_dims(v[i], 0) for k, v in inputs.items()}
            else:
                model_inputs = {k: v[i].unsqueeze(0) for k, v in inputs.items()}
            if offset_mapping is not None:
                model_inputs["offset_mapping"] = offset_mapping
            model_inputs["sentence"] = sentence if i == 0 else None  # postprocess only needs it once
            model_inputs["is_last"] = i == num_chunks - 1  # flags the final chunk for postprocess
            yield model_inputs

    def _forward(self, model_inputs):
        # Forward
        special_tokens_mask = model_inputs.pop("special_tokens_mask")
        offset_mapping = model_inputs.pop("offset_mapping", None)
        sentence = model_inputs.pop("sentence")
        is_last = model_inputs.pop("is_last")
        overflow_to_sample_mapping = model_inputs.pop("overflow_to_sample_mapping")

        output = self.model(**model_inputs)
        logits = output["logits"] if isinstance(output, dict) else output[0]

        model_outputs = {
            "logits": logits,
            "special_tokens_mask": special_tokens_mask,
            "offset_mapping": offset_mapping,
            "sentence": sentence,
            "overflow_to_sample_mapping": overflow_to_sample_mapping,
            "is_last": is_last,
            **model_inputs,
        }

        # We reshape outputs to fit with the postprocess inputs
        model_outputs["input_ids"] = torch.reshape(model_outputs["input_ids"], (1, -1))
        model_outputs["token_type_ids"] = torch.reshape(model_outputs["token_type_ids"], (1, -1))
        model_outputs["attention_mask"] = torch.reshape(model_outputs["attention_mask"], (1, -1))
        model_outputs["special_tokens_mask"] = torch.reshape(model_outputs["special_tokens_mask"], (1, -1))
        model_outputs["offset_mapping"] = torch.reshape(model_outputs["offset_mapping"], (1, -1, 2))
        return model_outputs


pipe = TokenClassificationChunkPipeline(model=model, tokenizer=tokenizer, aggregation_strategy="simple")

pipe("Bernard works at BNP Paribas in Paris.")
[out]:
[{'entity_group': 'PER',
'score': 0.9994497,
'word': 'Bernard',
'start': 0,
'end': 7},
{'entity_group': 'ORG',
'score': 0.9997708,
'word': 'BNP Paribas',
'start': 17,
'end': 28},
{'entity_group': 'LOC',
'score': 0.99906,
'word': 'Paris',
'start': 32,
'end': 37}]
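As a quick sanity check that the chunking actually kicks in (a hypothetical test, not from the tutorial), feed the pipeline a text far beyond the model's 512-token limit and confirm entities still come back from all chunks:

long_text = "Bernard works at BNP Paribas in Paris. " * 300
print(len(tokenizer(long_text, add_special_tokens=False)["input_ids"]))  # several thousand tokens
ents = pipe(long_text)
print(len(ents))  # entities found across all chunks, not just the first 512 tokens
print(ents[0])    # {'entity_group': 'PER', ..., 'word': 'Bernard', 'start': 0, 'end': 7}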
For reference, take a look at how the preprocess() and _forward() functions are coded in the TokenClassificationPipeline class: https://github.com/huggingface/transformers/blob/main/src/transformers/pipelines/token_classification.py
The preprocess() should return a generator; that's why the machinery feeding _forward() expects an iterator and complains TypeError: 'BatchEncoding' object is not an iterator.
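Conceptually, the chunk pipeline drives preprocess() like this (a simplified sketch of the contract, not the actual transformers internals; run_chunks_sketch is a made-up name):

def run_chunks_sketch(pipe, text):
    # The chunk machinery calls next() on whatever preprocess returns, so it
    # must be an iterator: a generator qualifies, a BatchEncoding does not.
    chunks = pipe.preprocess(text)
    all_outputs = []
    while True:
        try:
            model_inputs = next(chunks)  # TypeError if chunks is a BatchEncoding
        except StopIteration:
            break
        all_outputs.append(pipe._forward(model_inputs))
    # postprocess then aggregates entities across all chunks
    return pipe.postprocess(all_outputs, aggregation_strategy="simple")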