在 spaCy 的管道(pipeline)末尾添加 Span Categorizer 是没有作用的。这是因为在管道的最后一步,最后一个组件返回的不是一个 Doc 或一个 Span,而是一个单词标记(Token)。因此,它不可能创建一个覆盖整个文档的 Span 对象。
有两种方法可以解决这个问题:
import spacy
from spacy import displacy
from spacy.pipeline import Pipe
from spacy.lang.en import English
class MySpanCategorizer(Pipe):
    """Toy span categorizer that scores every single-token span with 0.5.

    For each document, every one-token span receives a ``cats`` dict mapping
    each known label to the constant score 0.5. The dict is stored on the
    custom ``Span._.cats`` extension attribute. No statistical model is used.
    """

    def __init__(self, nlp, labels=None):
        """Create the component.

        Args:
            nlp: The pipeline object this component belongs to (stored only).
            labels: Optional iterable of label names. It is copied, so later
                ``add_label`` calls never mutate the caller's list or a
                shared default.
        """
        self.labels = list(labels) if labels is not None else []
        self.model = None
        self.nlp = nlp
        self.batch_size = 1

    def __call__(self, doc):
        """Annotate a single ``doc`` in place and return it.

        ``predict`` already writes the annotations itself, so it is called
        directly here.
        NOTE(review): the inherited ``Pipe.__call__`` forwards predictions to
        ``set_annotations`` (and, in spaCy v2, first requires a non-None
        model); this class provides neither -- confirm against the spaCy
        version in use.
        """
        self.predict([doc])
        return doc

    def add_label(self, label):
        """Append ``label`` to the list of labels scored by ``predict``."""
        self.labels.append(label)

    def predict(self, docs):
        """Attach a ``cats`` dict to every single-token span of each doc.

        Args:
            docs: Iterable of documents to annotate in place.

        Returns:
            The same ``docs`` object that was passed in.
        """
        # Local import keeps the block self-contained without touching the
        # file-level import list.
        from spacy.tokens import Span

        # Registering the extension is loop-invariant: do it once per call
        # rather than once per span. ``force=True`` keeps the original
        # behaviour of overriding any previously registered ``cats``.
        Span.set_extension('cats', default={}, force=True)

        for doc in docs:
            for token in doc:
                single_token_span = doc[token.i:token.i + 1]
                # A fresh dict per span so that spans never share state.
                # Score is fixed at 0.5 for every label.
                single_token_span._.cats = dict.fromkeys(self.labels, 0.5)
        return docs
# Demo: run the custom categorizer as the last step of a blank English pipeline.
nlp = English()

# Build the component with its two labels and append it to the pipeline.
# NOTE(review): handing a component *instance* to add_pipe is the spaCy v2
# calling convention; spaCy v3 expects a registered factory name -- confirm
# which version this example targets.
span_cat = MySpanCategorizer(nlp, labels=['PERSON', 'ORG'])
nlp.add_pipe(span_cat, last=True)

# Process one sample sentence through the pipeline.
doc = nlp("George Washington was the first president of the United States.")

# Render inline in a Jupyter notebook.
# NOTE(review): the 'dep' style draws dependency arcs, and no parser is added
# to this pipeline above -- confirm that this is the intended visualisation.
displacy.render(doc, jupyter=True, style='dep')