From e07808a4808bef5760a2be77323e27705834b8d2 Mon Sep 17 00:00:00 2001 From: Nipun Sadvilkar Date: Wed, 29 Jun 2022 15:55:08 +0530 Subject: [PATCH] Update PySBD component to support spaCy v3 --- examples/pysbd_as_spacy_component.py | 23 ++++++++++---------- pysbd/about.py | 2 +- pysbd/utils.py | 32 ++++++++++++++++++---------- setup.py | 3 --- 4 files changed, 34 insertions(+), 26 deletions(-) diff --git a/examples/pysbd_as_spacy_component.py b/examples/pysbd_as_spacy_component.py index bd28ac7..e50e002 100644 --- a/examples/pysbd_as_spacy_component.py +++ b/examples/pysbd_as_spacy_component.py @@ -3,27 +3,28 @@ Installation: pip install spacy + +NOTE: Works with spacy>=3.x.x """ -import pysbd import spacy +from spacy.language import Language + +from pysbd.utils import PySBDFactory + + +@Language.factory("pysbd", default_config={"language": 'en'}) +def pysbd_component(nlp, name, language: str): + return PySBDFactory(nlp, language=language) -def pysbd_sentence_boundaries(doc): - seg = pysbd.Segmenter(language="en", clean=False, char_span=True) - sents_char_spans = seg.segment(doc.text) - char_spans = [doc.char_span(sent_span.start, sent_span.end) for sent_span in sents_char_spans] - start_token_ids = [span[0].idx for span in char_spans if span is not None] - for token in doc: - token.is_sent_start = True if token.idx in start_token_ids else False - return doc if __name__ == "__main__": text = "My name is Jonas E. Smith. Please turn to p. 55." nlp = spacy.blank('en') # add as a spacy pipeline component - nlp.add_pipe(pysbd_sentence_boundaries) + nlp.add_pipe("pysbd", first=True) doc = nlp(text) print('sent_id', 'sentence', sep='\t|\t') for sent_id, sent in enumerate(doc.sents, start=1): - print(sent_id, sent.text, sep='\t|\t') + print(sent_id, repr(sent.text), sep='\t|\t') diff --git a/pysbd/about.py b/pysbd/about.py index c8c9404..94de0ae 100644 --- a/pysbd/about.py +++ b/pysbd/about.py @@ -2,7 +2,7 @@ # https://python-packaging-user-guide.readthedocs.org/en/latest/single_source_version/ __title__ = "pysbd" -__version__ = "0.3.4" +__version__ = "0.3.5" __summary__ = "pysbd (Python Sentence Boundary Disambiguation) is a rule-based sentence boundary detection that works out-of-the-box across many languages." __uri__ = "http://nipunsadvilkar.github.io/" __author__ = "Nipun Sadvilkar" diff --git a/pysbd/utils.py b/pysbd/utils.py index 41e6716..4f51d4d 100644 --- a/pysbd/utils.py +++ b/pysbd/utils.py @@ -3,15 +3,16 @@ import re import pysbd -class Rule(object): +class Rule(object): def __init__(self, pattern, replacement): self.pattern = pattern self.replacement = replacement def __repr__(self): # pragma: no cover return '<{} pattern="{}" and replacement="{}">'.format( - self.__class__.__name__, self.pattern, self.replacement) + self.__class__.__name__, self.pattern, self.replacement + ) class Text(str): @@ -30,6 +31,7 @@ class Text(str): input as it is if rule pattern doesnt match else replacing found pattern with replacement chars """ + def apply(self, *rules): for each_r in rules: self = re.sub(each_r.pattern, each_r.replacement, self) @@ -37,7 +39,6 @@ def apply(self, *rules): class TextSpan(object): - def __init__(self, sent, start, end): """ Sentence text and its start & end character offsets within original text @@ -57,25 +58,34 @@ def __init__(self, sent, start, end): def __repr__(self): # pragma: no cover return "{0}(sent={1}, start={2}, end={3})".format( - self.__class__.__name__, repr(self.sent), self.start, self.end) + self.__class__.__name__, repr(self.sent), self.start, self.end + ) def __eq__(self, other): if isinstance(self, other.__class__): - return self.sent == other.sent and self.start == other.start and self.end == other.end + return ( + self.sent == other.sent + and self.start == other.start + and self.end == other.end + ) class PySBDFactory(object): """pysbd as a spacy component through entrypoints""" - def __init__(self, nlp, language='en'): + def __init__(self, nlp, language="en"): self.nlp = nlp - self.seg = pysbd.Segmenter(language=language, clean=False, - char_span=True) + self.seg = pysbd.Segmenter(language=language, clean=False, char_span=True) def __call__(self, doc): sents_char_spans = self.seg.segment(doc.text_with_ws) - start_token_ids = [sent.start for sent in sents_char_spans] + sents_char_spans_doc = [ + doc.char_span(sent_span.start, sent_span.end, alignment_mode="contract") + for sent_span in sents_char_spans + ] + start_token_ids = [ + span[0].idx for span in sents_char_spans_doc if span is not None + ] for token in doc: - token.is_sent_start = (True if token.idx - in start_token_ids else False) + token.is_sent_start = True if token.idx in start_token_ids else False return doc diff --git a/setup.py b/setup.py index 16b192d..8024c6c 100644 --- a/setup.py +++ b/setup.py @@ -102,8 +102,5 @@ def run(self): # $ setup.py publish support. cmdclass={ 'upload': UploadCommand, - }, - entry_points={ - "spacy_factories": ["pysbd = pysbd.utils:PySBDFactory"] } )