# Natural Language Toolkit: Chunk parsing API
#
# Copyright (C) 2001-2020 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT

"""
Named entity chunker
"""

import os
import pickle
import re
from xml.etree import ElementTree as ET

from nltk.tag import ClassifierBasedTagger, pos_tag

try:
    from nltk.classify import MaxentClassifier
except ImportError:
    # MaxentClassifier may be unavailable (e.g. if optional numeric
    # dependencies are missing); in that case NEChunkParserTagger will
    # fail at training time rather than at import time.
    pass

from nltk.tree import Tree
from nltk.tokenize import word_tokenize
from nltk.data import find
from nltk.chunk.api import ChunkParserI
from nltk.chunk.util import ChunkScore


class NEChunkParserTagger(ClassifierBasedTagger):
    """
    The IOB tagger used by the chunk parser.
    """

    def __init__(self, train):
        ClassifierBasedTagger.__init__(
            self, train=train, classifier_builder=self._classifier_builder
        )

    def _classifier_builder(self, train):
        return MaxentClassifier.train(
            train, algorithm="megam", gaussian_prior_sigma=1, trace=2
        )

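    # NOTE: the "megam" algorithm relies on the external megam binary, which
    # must be installed and configured separately (see nltk.classify.megam).
    # If megam is unavailable, a pure-Python trainer such as algorithm="iis"
    # should work as a drop-in substitute, though training will be much slower.
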
    def _english_wordlist(self):
        # Lazily load and cache the basic English wordlist on first use.
        try:
            wl = self._en_wordlist
        except AttributeError:
            from nltk.corpus import words

            self._en_wordlist = set(words.words("en-basic"))
            wl = self._en_wordlist
        return wl

    def _feature_detector(self, tokens, index, history):
        word = tokens[index][0]
        pos = simplify_pos(tokens[index][1])
        if index == 0:
            prevword = prevprevword = None
            prevpos = prevprevpos = None
            prevshape = prevtag = prevprevtag = None
        elif index == 1:
            prevword = tokens[index - 1][0].lower()
            prevprevword = None
            prevpos = simplify_pos(tokens[index - 1][1])
            prevprevpos = None
            prevtag = history[index - 1]
            prevshape = prevprevtag = None
        else:
            prevword = tokens[index - 1][0].lower()
            prevprevword = tokens[index - 2][0].lower()
            prevpos = simplify_pos(tokens[index - 1][1])
            prevprevpos = simplify_pos(tokens[index - 2][1])
            prevtag = history[index - 1]
            prevprevtag = history[index - 2]
            prevshape = shape(prevword)
        if index == len(tokens) - 1:
            nextword = nextnextword = None
            nextpos = nextnextpos = None
        elif index == len(tokens) - 2:
            nextword = tokens[index + 1][0].lower()
            nextpos = tokens[index + 1][1].lower()
            nextnextword = None
            nextnextpos = None
        else:
            nextword = tokens[index + 1][0].lower()
            nextpos = tokens[index + 1][1].lower()
            nextnextword = tokens[index + 2][0].lower()
            nextnextpos = tokens[index + 2][1].lower()

        # (The prevprev*/nextnext* context computed above is not currently
        # used in the feature set below.)
        # 89.6
        features = {
            "bias": True,
            "shape": shape(word),
            "wordlen": len(word),
            "prefix3": word[:3].lower(),
            "suffix3": word[-3:].lower(),
            "pos": pos,
            "word": word,
            "en-wordlist": (word in self._english_wordlist()),
            "prevtag": prevtag,
            "prevpos": prevpos,
            "nextpos": nextpos,
            "prevword": prevword,
            "nextword": nextword,
            "word+nextpos": "{0}+{1}".format(word.lower(), nextpos),
            "pos+prevtag": "{0}+{1}".format(pos, prevtag),
            "shape+prevtag": "{0}+{1}".format(prevshape, prevtag),
        }
        return features

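    # Illustrative sketch (not output from an actual run): for the pos-tagged
    # sentence [("Mark", "NNP"), ("visited", "VBD"), ("Paris", "NNP")] at
    # index=2 with history=["B-NE", "O"], the detector would emit features
    # such as shape="upcase", pos="NNP", prevtag="O", prevpos="V",
    # "pos+prevtag"="NNP+O", and nextpos=None.

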
class NEChunkParser(ChunkParserI):
    """
    Expected input: list of pos-tagged words
    """

    def __init__(self, train):
        self._train(train)

    def parse(self, tokens):
        """
        Each token should be a pos-tagged word
        """
        tagged = self._tagger.tag(tokens)
        tree = self._tagged_to_parse(tagged)
        return tree

    def _train(self, corpus):
        # Convert to tagged sequence
        corpus = [self._parse_to_tagged(s) for s in corpus]
        self._tagger = NEChunkParserTagger(train=corpus)

    def _tagged_to_parse(self, tagged_tokens):
        """
        Convert a list of tagged tokens to a chunk-parse tree.
        """
        sent = Tree("S", [])
        for (tok, tag) in tagged_tokens:
            if tag == "O":
                sent.append(tok)
            elif tag.startswith("B-"):
                sent.append(Tree(tag[2:], [tok]))
            elif tag.startswith("I-"):
                if sent and isinstance(sent[-1], Tree) and sent[-1].label() == tag[2:]:
                    sent[-1].append(tok)
                else:
                    sent.append(Tree(tag[2:], [tok]))
        return sent

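    # For example (illustrative): the tagged sequence
    #   [(("John", "NNP"), "B-NE"), (("Smith", "NNP"), "I-NE"),
    #    (("ran", "VBD"), "O")]
    # becomes
    #   Tree("S", [Tree("NE", [("John", "NNP"), ("Smith", "NNP")]),
    #              ("ran", "VBD")])
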
    @staticmethod
    def _parse_to_tagged(sent):
        """
        Convert a chunk-parse tree to a list of tagged tokens.
        """
        toks = []
        for child in sent:
            if isinstance(child, Tree):
                if len(child) == 0:
                    print("Warning -- empty chunk in sentence")
                    continue
                toks.append((child[0], "B-{0}".format(child.label())))
                for tok in child[1:]:
                    toks.append((tok, "I-{0}".format(child.label())))
            else:
                toks.append((child, "O"))
        return toks


def shape(word):
    # Anchor both alternatives of the number pattern so that strings like
    # "3rd" are not misclassified as numbers.
    if re.match(r"[0-9]+(\.[0-9]*)?$|[0-9]*\.[0-9]+$", word, re.UNICODE):
        return "number"
    elif re.match(r"\W+$", word, re.UNICODE):
        return "punct"
    elif re.match(r"\w+$", word, re.UNICODE):
        if word.istitle():
            return "upcase"
        elif word.islower():
            return "downcase"
        else:
            return "mixedcase"
    else:
        return "other"


def simplify_pos(s):
    if s.startswith("V"):
        return "V"
    else:
        return s.split("-")[0]


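# Illustrative examples: simplify_pos("VBD") == "V" (all verb tags collapse
# to "V"), while simplify_pos("NN-TL") == "NN" (hyphenated suffixes are
# dropped).

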
def postag_tree(tree):
    # Part-of-speech tagging.
    words = tree.leaves()
    tag_iter = (pos for (word, pos) in pos_tag(words))
    newtree = Tree("S", [])
    for child in tree:
        if isinstance(child, Tree):
            newtree.append(Tree(child.label(), []))
            for subchild in child:
                newtree[-1].append((subchild, next(tag_iter)))
        else:
            newtree.append((child, next(tag_iter)))
    return newtree


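# Illustrative example: postag_tree(Tree("S", [Tree("NE", ["John"]), "ran"]))
# rebuilds the tree with (word, pos) leaves, e.g.
# Tree("S", [Tree("NE", [("John", "NNP")]), ("ran", "VBD")]), with the exact
# tags depending on what pos_tag assigns.

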
def load_ace_data(roots, fmt="binary", skip_bnews=True):
    for root in roots:
        for dirpath, dirs, files in os.walk(root):
            if dirpath.endswith("bnews") and skip_bnews:
                continue
            for f in files:
                if f.endswith(".sgm"):
                    for sent in load_ace_file(os.path.join(dirpath, f), fmt):
                        yield sent


def load_ace_file(textfile, fmt):
    print(" - {0}".format(os.path.split(textfile)[1]))
    annfile = textfile + ".tmx.rdc.xml"

    # Read the xml file, and get a list of entities
    entities = []
    with open(annfile, "r") as infile:
        xml = ET.parse(infile).getroot()
    for entity in xml.findall("document/entity"):
        typ = entity.find("entity_type").text
        for mention in entity.findall("entity_mention"):
            if mention.get("TYPE") != "NAME":
                continue  # only NEs
            s = int(mention.find("head/charseq/start").text)
            e = int(mention.find("head/charseq/end").text) + 1
            entities.append((s, e, typ))

    # Read the text file, and mark the entities.
    with open(textfile, "r") as infile:
        text = infile.read()

    # Strip XML tags, since they don't count towards the indices
    text = re.sub("<(?!/?TEXT)[^>]+>", "", text)

    # Blank out anything before/after <TEXT>
    def subfunc(m):
        # Replace everything up to and including "<TEXT>" with spaces; the
        # result is six characters shorter (the length of the tag itself).
        return " " * (m.end() - m.start() - 6)

    text = re.sub(r"[\s\S]*<TEXT>", subfunc, text)
    text = re.sub(r"</TEXT>[\s\S]*", "", text)

    # Simplify quotes
    text = re.sub("``", ' "', text)
    text = re.sub("''", '" ', text)

    entity_types = set(typ for (s, e, typ) in entities)  # (currently unused)

    # Binary distinction (NE or not NE)
    if fmt == "binary":
        i = 0
        toks = Tree("S", [])
        for (s, e, typ) in sorted(entities):
            if s < i:
                s = i  # Overlapping!  Deal with this better?
            if e <= s:
                continue
            toks.extend(word_tokenize(text[i:s]))
            toks.append(Tree("NE", text[s:e].split()))
            i = e
        toks.extend(word_tokenize(text[i:]))
        yield toks

    # Multiclass distinction (NE type)
    elif fmt == "multiclass":
        i = 0
        toks = Tree("S", [])
        for (s, e, typ) in sorted(entities):
            if s < i:
                s = i  # Overlapping!  Deal with this better?
            if e <= s:
                continue
            toks.extend(word_tokenize(text[i:s]))
            toks.append(Tree(typ, text[s:e].split()))
            i = e
        toks.extend(word_tokenize(text[i:]))
        yield toks

    else:
        raise ValueError("bad fmt value")


# This probably belongs in a more general-purpose location (as does
# the parse_to_tagged function).
def cmp_chunks(correct, guessed):
    correct = NEChunkParser._parse_to_tagged(correct)
    guessed = NEChunkParser._parse_to_tagged(guessed)
    ellipsis = False
    for (w, ct), (_, gt) in zip(correct, guessed):
        if ct == gt == "O":
            if not ellipsis:
                print("  {0:15} {1:15} {2}".format(ct, gt, w))
                print("  {0:15} {1:15} {2}".format("...", "...", "..."))
                ellipsis = True
        else:
            ellipsis = False
            print("  {0:15} {1:15} {2}".format(ct, gt, w))


def build_model(fmt="binary"):
    print("Loading training data...")
    train_paths = [
        find("corpora/ace_data/ace.dev"),
        find("corpora/ace_data/ace.heldout"),
        find("corpora/ace_data/bbn.dev"),
        find("corpora/ace_data/muc.dev"),
    ]
    train_trees = load_ace_data(train_paths, fmt)
    train_data = [postag_tree(t) for t in train_trees]
    print("Training...")
    cp = NEChunkParser(train_data)
    del train_data

    print("Loading eval data...")
    eval_paths = [find("corpora/ace_data/ace.eval")]
    eval_trees = load_ace_data(eval_paths, fmt)
    eval_data = [postag_tree(t) for t in eval_trees]

    print("Evaluating...")
    chunkscore = ChunkScore()
    for i, correct in enumerate(eval_data):
        guess = cp.parse(correct.leaves())
        chunkscore.score(correct, guess)
        if i < 3:
            cmp_chunks(correct, guess)
    print(chunkscore)

    outfilename = "/tmp/ne_chunker_{0}.pickle".format(fmt)
    print("Saving chunker to {0}...".format(outfilename))
    with open(outfilename, "wb") as outfile:
        pickle.dump(cp, outfile, -1)

    return cp


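# To reuse a saved chunker, unpickle it and feed it pos-tagged tokens
# (illustrative sketch, assuming the pickle written by build_model above):
#
#     with open("/tmp/ne_chunker_binary.pickle", "rb") as f:
#         cp = pickle.load(f)
#     tree = cp.parse(pos_tag(word_tokenize("John works in London.")))

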
if __name__ == "__main__":
    # Make sure that the pickled object has the right class name:
    from nltk.chunk.named_entity import build_model

    build_model("binary")
    build_model("multiclass")