  1. # Natural Language Toolkit: Chunk format conversions
  2. #
  3. # Copyright (C) 2001-2020 NLTK Project
  4. # Author: Edward Loper <edloper@gmail.com>
  5. # Steven Bird <stevenbird1@gmail.com> (minor additions)
  6. # URL: <http://nltk.org/>
  7. # For license information, see LICENSE.TXT
  8. import re
  9. from nltk.tree import Tree
  10. from nltk.tag.mapping import map_tag
  11. from nltk.tag.util import str2tuple
  12. ##//////////////////////////////////////////////////////
  13. ## EVALUATION
  14. ##//////////////////////////////////////////////////////
  15. from nltk.metrics import accuracy as _accuracy
  16. def accuracy(chunker, gold):
  17. """
  18. Score the accuracy of the chunker against the gold standard.
  19. Strip the chunk information from the gold standard and rechunk it using
  20. the chunker, then compute the accuracy score.
  21. :type chunker: ChunkParserI
  22. :param chunker: The chunker being evaluated.
  23. :type gold: tree
  24. :param gold: The chunk structures to score the chunker on.
  25. :rtype: float
  26. """
  27. gold_tags = []
  28. test_tags = []
  29. for gold_tree in gold:
  30. test_tree = chunker.parse(gold_tree.flatten())
  31. gold_tags += tree2conlltags(gold_tree)
  32. test_tags += tree2conlltags(test_tree)
  33. # print 'GOLD:', gold_tags[:50]
  34. # print 'TEST:', test_tags[:50]
  35. return _accuracy(gold_tags, test_tags)
  36. # Patched for increased performance by Yoav Goldberg <yoavg@cs.bgu.ac.il>, 2006-01-13
  37. # -- statistics are evaluated only on demand, instead of at every sentence evaluation
  38. #
  39. # SB: use nltk.metrics for precision/recall scoring?
  40. #
  41. class ChunkScore(object):
  42. """
  43. A utility class for scoring chunk parsers. ``ChunkScore`` can
  44. evaluate a chunk parser's output, based on a number of statistics
  45. (precision, recall, f-measure, misssed chunks, incorrect chunks).
  46. It can also combine the scores from the parsing of multiple texts;
  47. this makes it significantly easier to evaluate a chunk parser that
  48. operates one sentence at a time.
  49. Texts are evaluated with the ``score`` method. The results of
  50. evaluation can be accessed via a number of accessor methods, such
  51. as ``precision`` and ``f_measure``. A typical use of the
  52. ``ChunkScore`` class is::
  53. >>> chunkscore = ChunkScore() # doctest: +SKIP
  54. >>> for correct in correct_sentences: # doctest: +SKIP
  55. ... guess = chunkparser.parse(correct.leaves()) # doctest: +SKIP
  56. ... chunkscore.score(correct, guess) # doctest: +SKIP
  57. >>> print('F Measure:', chunkscore.f_measure()) # doctest: +SKIP
  58. F Measure: 0.823
  59. :ivar kwargs: Keyword arguments:
  60. - max_tp_examples: The maximum number actual examples of true
  61. positives to record. This affects the ``correct`` member
  62. function: ``correct`` will not return more than this number
  63. of true positive examples. This does *not* affect any of
  64. the numerical metrics (precision, recall, or f-measure)
  65. - max_fp_examples: The maximum number actual examples of false
  66. positives to record. This affects the ``incorrect`` member
  67. function and the ``guessed`` member function: ``incorrect``
  68. will not return more than this number of examples, and
  69. ``guessed`` will not return more than this number of true
  70. positive examples. This does *not* affect any of the
  71. numerical metrics (precision, recall, or f-measure)
  72. - max_fn_examples: The maximum number actual examples of false
  73. negatives to record. This affects the ``missed`` member
  74. function and the ``correct`` member function: ``missed``
  75. will not return more than this number of examples, and
  76. ``correct`` will not return more than this number of true
  77. negative examples. This does *not* affect any of the
  78. numerical metrics (precision, recall, or f-measure)
  79. - chunk_label: A regular expression indicating which chunks
  80. should be compared. Defaults to ``'.*'`` (i.e., all chunks).
  81. :type _tp: list(Token)
  82. :ivar _tp: List of true positives
  83. :type _fp: list(Token)
  84. :ivar _fp: List of false positives
  85. :type _fn: list(Token)
  86. :ivar _fn: List of false negatives
  87. :type _tp_num: int
  88. :ivar _tp_num: Number of true positives
  89. :type _fp_num: int
  90. :ivar _fp_num: Number of false positives
  91. :type _fn_num: int
  92. :ivar _fn_num: Number of false negatives.
  93. """
  94. def __init__(self, **kwargs):
  95. self._correct = set()
  96. self._guessed = set()
  97. self._tp = set()
  98. self._fp = set()
  99. self._fn = set()
  100. self._max_tp = kwargs.get("max_tp_examples", 100)
  101. self._max_fp = kwargs.get("max_fp_examples", 100)
  102. self._max_fn = kwargs.get("max_fn_examples", 100)
  103. self._chunk_label = kwargs.get("chunk_label", ".*")
  104. self._tp_num = 0
  105. self._fp_num = 0
  106. self._fn_num = 0
  107. self._count = 0
  108. self._tags_correct = 0.0
  109. self._tags_total = 0.0
  110. self._measuresNeedUpdate = False
  111. def _updateMeasures(self):
  112. if self._measuresNeedUpdate:
  113. self._tp = self._guessed & self._correct
  114. self._fn = self._correct - self._guessed
  115. self._fp = self._guessed - self._correct
  116. self._tp_num = len(self._tp)
  117. self._fp_num = len(self._fp)
  118. self._fn_num = len(self._fn)
  119. self._measuresNeedUpdate = False
  120. def score(self, correct, guessed):
  121. """
  122. Given a correctly chunked sentence, score another chunked
  123. version of the same sentence.
  124. :type correct: chunk structure
  125. :param correct: The known-correct ("gold standard") chunked
  126. sentence.
  127. :type guessed: chunk structure
  128. :param guessed: The chunked sentence to be scored.
  129. """
  130. self._correct |= _chunksets(correct, self._count, self._chunk_label)
  131. self._guessed |= _chunksets(guessed, self._count, self._chunk_label)
  132. self._count += 1
  133. self._measuresNeedUpdate = True
  134. # Keep track of per-tag accuracy (if possible)
  135. try:
  136. correct_tags = tree2conlltags(correct)
  137. guessed_tags = tree2conlltags(guessed)
  138. except ValueError:
  139. # This exception case is for nested chunk structures,
  140. # where tree2conlltags will fail with a ValueError: "Tree
  141. # is too deeply nested to be printed in CoNLL format."
  142. correct_tags = guessed_tags = ()
  143. self._tags_total += len(correct_tags)
  144. self._tags_correct += sum(
  145. 1 for (t, g) in zip(guessed_tags, correct_tags) if t == g
  146. )
  147. def accuracy(self):
  148. """
  149. Return the overall tag-based accuracy for all text that have
  150. been scored by this ``ChunkScore``, using the IOB (conll2000)
  151. tag encoding.
  152. :rtype: float
  153. """
  154. if self._tags_total == 0:
  155. return 1
  156. return self._tags_correct / self._tags_total
  157. def precision(self):
  158. """
  159. Return the overall precision for all texts that have been
  160. scored by this ``ChunkScore``.
  161. :rtype: float
  162. """
  163. self._updateMeasures()
  164. div = self._tp_num + self._fp_num
  165. if div == 0:
  166. return 0
  167. else:
  168. return self._tp_num / div
  169. def recall(self):
  170. """
  171. Return the overall recall for all texts that have been
  172. scored by this ``ChunkScore``.
  173. :rtype: float
  174. """
  175. self._updateMeasures()
  176. div = self._tp_num + self._fn_num
  177. if div == 0:
  178. return 0
  179. else:
  180. return self._tp_num / div
  181. def f_measure(self, alpha=0.5):
  182. """
  183. Return the overall F measure for all texts that have been
  184. scored by this ``ChunkScore``.
  185. :param alpha: the relative weighting of precision and recall.
  186. Larger alpha biases the score towards the precision value,
  187. while smaller alpha biases the score towards the recall
  188. value. ``alpha`` should have a value in the range [0,1].
  189. :type alpha: float
  190. :rtype: float
  191. """
  192. self._updateMeasures()
  193. p = self.precision()
  194. r = self.recall()
  195. if p == 0 or r == 0: # what if alpha is 0 or 1?
  196. return 0
  197. return 1 / (alpha / p + (1 - alpha) / r)
  198. def missed(self):
  199. """
  200. Return the chunks which were included in the
  201. correct chunk structures, but not in the guessed chunk
  202. structures, listed in input order.
  203. :rtype: list of chunks
  204. """
  205. self._updateMeasures()
  206. chunks = list(self._fn)
  207. return [c[1] for c in chunks] # discard position information
  208. def incorrect(self):
  209. """
  210. Return the chunks which were included in the guessed chunk structures,
  211. but not in the correct chunk structures, listed in input order.
  212. :rtype: list of chunks
  213. """
  214. self._updateMeasures()
  215. chunks = list(self._fp)
  216. return [c[1] for c in chunks] # discard position information
  217. def correct(self):
  218. """
  219. Return the chunks which were included in the correct
  220. chunk structures, listed in input order.
  221. :rtype: list of chunks
  222. """
  223. chunks = list(self._correct)
  224. return [c[1] for c in chunks] # discard position information
  225. def guessed(self):
  226. """
  227. Return the chunks which were included in the guessed
  228. chunk structures, listed in input order.
  229. :rtype: list of chunks
  230. """
  231. chunks = list(self._guessed)
  232. return [c[1] for c in chunks] # discard position information
  233. def __len__(self):
  234. self._updateMeasures()
  235. return self._tp_num + self._fn_num
  236. def __repr__(self):
  237. """
  238. Return a concise representation of this ``ChunkScoring``.
  239. :rtype: str
  240. """
  241. return "<ChunkScoring of " + repr(len(self)) + " chunks>"
  242. def __str__(self):
  243. """
  244. Return a verbose representation of this ``ChunkScoring``.
  245. This representation includes the precision, recall, and
  246. f-measure scores. For other information about the score,
  247. use the accessor methods (e.g., ``missed()`` and ``incorrect()``).
  248. :rtype: str
  249. """
  250. return (
  251. "ChunkParse score:\n"
  252. + (" IOB Accuracy: {:5.1f}%%\n".format(self.accuracy() * 100))
  253. + (" Precision: {:5.1f}%%\n".format(self.precision() * 100))
  254. + (" Recall: {:5.1f}%%\n".format(self.recall() * 100))
  255. + (" F-Measure: {:5.1f}%%".format(self.f_measure() * 100))
  256. )
  257. # extract chunks, and assign unique id, the absolute position of
  258. # the first word of the chunk
  259. def _chunksets(t, count, chunk_label):
  260. pos = 0
  261. chunks = []
  262. for child in t:
  263. if isinstance(child, Tree):
  264. if re.match(chunk_label, child.label()):
  265. chunks.append(((count, pos), child.freeze()))
  266. pos += len(child.leaves())
  267. else:
  268. pos += 1
  269. return set(chunks)
  270. def tagstr2tree(
  271. s, chunk_label="NP", root_label="S", sep="/", source_tagset=None, target_tagset=None
  272. ):
  273. """
  274. Divide a string of bracketted tagged text into
  275. chunks and unchunked tokens, and produce a Tree.
  276. Chunks are marked by square brackets (``[...]``). Words are
  277. delimited by whitespace, and each word should have the form
  278. ``text/tag``. Words that do not contain a slash are
  279. assigned a ``tag`` of None.
  280. :param s: The string to be converted
  281. :type s: str
  282. :param chunk_label: The label to use for chunk nodes
  283. :type chunk_label: str
  284. :param root_label: The label to use for the root of the tree
  285. :type root_label: str
  286. :rtype: Tree
  287. """
  288. WORD_OR_BRACKET = re.compile(r"\[|\]|[^\[\]\s]+")
  289. stack = [Tree(root_label, [])]
  290. for match in WORD_OR_BRACKET.finditer(s):
  291. text = match.group()
  292. if text[0] == "[":
  293. if len(stack) != 1:
  294. raise ValueError("Unexpected [ at char {:d}".format(match.start()))
  295. chunk = Tree(chunk_label, [])
  296. stack[-1].append(chunk)
  297. stack.append(chunk)
  298. elif text[0] == "]":
  299. if len(stack) != 2:
  300. raise ValueError("Unexpected ] at char {:d}".format(match.start()))
  301. stack.pop()
  302. else:
  303. if sep is None:
  304. stack[-1].append(text)
  305. else:
  306. word, tag = str2tuple(text, sep)
  307. if source_tagset and target_tagset:
  308. tag = map_tag(source_tagset, target_tagset, tag)
  309. stack[-1].append((word, tag))
  310. if len(stack) != 1:
  311. raise ValueError("Expected ] at char {:d}".format(len(s)))
  312. return stack[0]
  313. ### CONLL
  314. _LINE_RE = re.compile("(\S+)\s+(\S+)\s+([IOB])-?(\S+)?")
  315. def conllstr2tree(s, chunk_types=("NP", "PP", "VP"), root_label="S"):
  316. """
  317. Return a chunk structure for a single sentence
  318. encoded in the given CONLL 2000 style string.
  319. This function converts a CoNLL IOB string into a tree.
  320. It uses the specified chunk types
  321. (defaults to NP, PP and VP), and creates a tree rooted at a node
  322. labeled S (by default).
  323. :param s: The CoNLL string to be converted.
  324. :type s: str
  325. :param chunk_types: The chunk types to be converted.
  326. :type chunk_types: tuple
  327. :param root_label: The node label to use for the root.
  328. :type root_label: str
  329. :rtype: Tree
  330. """
  331. stack = [Tree(root_label, [])]
  332. for lineno, line in enumerate(s.split("\n")):
  333. if not line.strip():
  334. continue
  335. # Decode the line.
  336. match = _LINE_RE.match(line)
  337. if match is None:
  338. raise ValueError("Error on line {:d}".format(lineno))
  339. (word, tag, state, chunk_type) = match.groups()
  340. # If it's a chunk type we don't care about, treat it as O.
  341. if chunk_types is not None and chunk_type not in chunk_types:
  342. state = "O"
  343. # For "Begin"/"Outside", finish any completed chunks -
  344. # also do so for "Inside" which don't match the previous token.
  345. mismatch_I = state == "I" and chunk_type != stack[-1].label()
  346. if state in "BO" or mismatch_I:
  347. if len(stack) == 2:
  348. stack.pop()
  349. # For "Begin", start a new chunk.
  350. if state == "B" or mismatch_I:
  351. chunk = Tree(chunk_type, [])
  352. stack[-1].append(chunk)
  353. stack.append(chunk)
  354. # Add the new word token.
  355. stack[-1].append((word, tag))
  356. return stack[0]
  357. def tree2conlltags(t):
  358. """
  359. Return a list of 3-tuples containing ``(word, tag, IOB-tag)``.
  360. Convert a tree to the CoNLL IOB tag format.
  361. :param t: The tree to be converted.
  362. :type t: Tree
  363. :rtype: list(tuple)
  364. """
  365. tags = []
  366. for child in t:
  367. try:
  368. category = child.label()
  369. prefix = "B-"
  370. for contents in child:
  371. if isinstance(contents, Tree):
  372. raise ValueError(
  373. "Tree is too deeply nested to be printed in CoNLL format"
  374. )
  375. tags.append((contents[0], contents[1], prefix + category))
  376. prefix = "I-"
  377. except AttributeError:
  378. tags.append((child[0], child[1], "O"))
  379. return tags
  380. def conlltags2tree(
  381. sentence, chunk_types=("NP", "PP", "VP"), root_label="S", strict=False
  382. ):
  383. """
  384. Convert the CoNLL IOB format to a tree.
  385. """
  386. tree = Tree(root_label, [])
  387. for (word, postag, chunktag) in sentence:
  388. if chunktag is None:
  389. if strict:
  390. raise ValueError("Bad conll tag sequence")
  391. else:
  392. # Treat as O
  393. tree.append((word, postag))
  394. elif chunktag.startswith("B-"):
  395. tree.append(Tree(chunktag[2:], [(word, postag)]))
  396. elif chunktag.startswith("I-"):
  397. if (
  398. len(tree) == 0
  399. or not isinstance(tree[-1], Tree)
  400. or tree[-1].label() != chunktag[2:]
  401. ):
  402. if strict:
  403. raise ValueError("Bad conll tag sequence")
  404. else:
  405. # Treat as B-*
  406. tree.append(Tree(chunktag[2:], [(word, postag)]))
  407. else:
  408. tree[-1].append((word, postag))
  409. elif chunktag == "O":
  410. tree.append((word, postag))
  411. else:
  412. raise ValueError("Bad conll tag {0!r}".format(chunktag))
  413. return tree
  414. def tree2conllstr(t):
  415. """
  416. Return a multiline string where each line contains a word, tag and IOB tag.
  417. Convert a tree to the CoNLL IOB string format
  418. :param t: The tree to be converted.
  419. :type t: Tree
  420. :rtype: str
  421. """
  422. lines = [" ".join(token) for token in tree2conlltags(t)]
  423. return "\n".join(lines)
  424. ### IEER
  425. _IEER_DOC_RE = re.compile(
  426. r"<DOC>\s*"
  427. r"(<DOCNO>\s*(?P<docno>.+?)\s*</DOCNO>\s*)?"
  428. r"(<DOCTYPE>\s*(?P<doctype>.+?)\s*</DOCTYPE>\s*)?"
  429. r"(<DATE_TIME>\s*(?P<date_time>.+?)\s*</DATE_TIME>\s*)?"
  430. r"<BODY>\s*"
  431. r"(<HEADLINE>\s*(?P<headline>.+?)\s*</HEADLINE>\s*)?"
  432. r"<TEXT>(?P<text>.*?)</TEXT>\s*"
  433. r"</BODY>\s*</DOC>\s*",
  434. re.DOTALL,
  435. )
  436. _IEER_TYPE_RE = re.compile('<b_\w+\s+[^>]*?type="(?P<type>\w+)"')
  437. def _ieer_read_text(s, root_label):
  438. stack = [Tree(root_label, [])]
  439. # s will be None if there is no headline in the text
  440. # return the empty list in place of a Tree
  441. if s is None:
  442. return []
  443. for piece_m in re.finditer("<[^>]+>|[^\s<]+", s):
  444. piece = piece_m.group()
  445. try:
  446. if piece.startswith("<b_"):
  447. m = _IEER_TYPE_RE.match(piece)
  448. if m is None:
  449. print("XXXX", piece)
  450. chunk = Tree(m.group("type"), [])
  451. stack[-1].append(chunk)
  452. stack.append(chunk)
  453. elif piece.startswith("<e_"):
  454. stack.pop()
  455. # elif piece.startswith('<'):
  456. # print "ERROR:", piece
  457. # raise ValueError # Unexpected HTML
  458. else:
  459. stack[-1].append(piece)
  460. except (IndexError, ValueError):
  461. raise ValueError(
  462. "Bad IEER string (error at character {:d})".format(piece_m.start())
  463. )
  464. if len(stack) != 1:
  465. raise ValueError("Bad IEER string")
  466. return stack[0]
  467. def ieerstr2tree(
  468. s,
  469. chunk_types=[
  470. "LOCATION",
  471. "ORGANIZATION",
  472. "PERSON",
  473. "DURATION",
  474. "DATE",
  475. "CARDINAL",
  476. "PERCENT",
  477. "MONEY",
  478. "MEASURE",
  479. ],
  480. root_label="S",
  481. ):
  482. """
  483. Return a chunk structure containing the chunked tagged text that is
  484. encoded in the given IEER style string.
  485. Convert a string of chunked tagged text in the IEER named
  486. entity format into a chunk structure. Chunks are of several
  487. types, LOCATION, ORGANIZATION, PERSON, DURATION, DATE, CARDINAL,
  488. PERCENT, MONEY, and MEASURE.
  489. :rtype: Tree
  490. """
  491. # Try looking for a single document. If that doesn't work, then just
  492. # treat everything as if it was within the <TEXT>...</TEXT>.
  493. m = _IEER_DOC_RE.match(s)
  494. if m:
  495. return {
  496. "text": _ieer_read_text(m.group("text"), root_label),
  497. "docno": m.group("docno"),
  498. "doctype": m.group("doctype"),
  499. "date_time": m.group("date_time"),
  500. #'headline': m.group('headline')
  501. # we want to capture NEs in the headline too!
  502. "headline": _ieer_read_text(m.group("headline"), root_label),
  503. }
  504. else:
  505. return _ieer_read_text(s, root_label)
  506. def demo():
  507. s = "[ Pierre/NNP Vinken/NNP ] ,/, [ 61/CD years/NNS ] old/JJ ,/, will/MD join/VB [ the/DT board/NN ] ./."
  508. import nltk
  509. t = nltk.chunk.tagstr2tree(s, chunk_label="NP")
  510. t.pprint()
  511. print()
  512. s = """
  513. These DT B-NP
  514. research NN I-NP
  515. protocols NNS I-NP
  516. offer VBP B-VP
  517. to TO B-PP
  518. the DT B-NP
  519. patient NN I-NP
  520. not RB O
  521. only RB O
  522. the DT B-NP
  523. very RB I-NP
  524. best JJS I-NP
  525. therapy NN I-NP
  526. which WDT B-NP
  527. we PRP B-NP
  528. have VBP B-VP
  529. established VBN I-VP
  530. today NN B-NP
  531. but CC B-NP
  532. also RB I-NP
  533. the DT B-NP
  534. hope NN I-NP
  535. of IN B-PP
  536. something NN B-NP
  537. still RB B-ADJP
  538. better JJR I-ADJP
  539. . . O
  540. """
  541. conll_tree = conllstr2tree(s, chunk_types=("NP", "PP"))
  542. conll_tree.pprint()
  543. # Demonstrate CoNLL output
  544. print("CoNLL output:")
  545. print(nltk.chunk.tree2conllstr(conll_tree))
  546. print()
  547. if __name__ == "__main__":
  548. demo()