rte.py 4.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145
  1. # Natural Language Toolkit: RTE Corpus Reader
  2. #
  3. # Copyright (C) 2001-2020 NLTK Project
  4. # Author: Ewan Klein <ewan@inf.ed.ac.uk>
  5. # URL: <http://nltk.org/>
  6. # For license information, see LICENSE.TXT
  7. """
  8. Corpus reader for the Recognizing Textual Entailment (RTE) Challenge Corpora.
  9. The files were taken from the RTE1, RTE2 and RTE3 datasets and the files
  10. were regularized.
  11. Filenames are of the form rte*_dev.xml and rte*_test.xml. The latter are the
  12. gold standard annotated files.
  13. Each entailment corpus is a list of 'text'/'hypothesis' pairs. The following
  14. example is taken from RTE3::
  15. <pair id="1" entailment="YES" task="IE" length="short" >
  16. <t>The sale was made to pay Yukos' US$ 27.5 billion tax bill,
  17. Yuganskneftegaz was originally sold for US$ 9.4 billion to a little known
  18. company Baikalfinansgroup which was later bought by the Russian
  19. state-owned oil company Rosneft .</t>
  20. <h>Baikalfinansgroup was sold to Rosneft.</h>
  21. </pair>
  22. In order to provide globally unique IDs for each pair, a new attribute
  23. ``challenge`` has been added to the root element ``entailment-corpus`` of each
  24. file, taking values 1, 2 or 3. The GID is formatted 'm-n', where 'm' is the
  25. challenge number and 'n' is the pair ID.
  26. """
  27. from nltk.corpus.reader.util import *
  28. from nltk.corpus.reader.api import *
  29. from nltk.corpus.reader.xmldocs import *
  30. def norm(value_string):
  31. """
  32. Normalize the string value in an RTE pair's ``value`` or ``entailment``
  33. attribute as an integer (1, 0).
  34. :param value_string: the label used to classify a text/hypothesis pair
  35. :type value_string: str
  36. :rtype: int
  37. """
  38. valdict = {"TRUE": 1, "FALSE": 0, "YES": 1, "NO": 0}
  39. return valdict[value_string.upper()]
  40. class RTEPair(object):
  41. """
  42. Container for RTE text-hypothesis pairs.
  43. The entailment relation is signalled by the ``value`` attribute in RTE1, and by
  44. ``entailment`` in RTE2 and RTE3. These both get mapped on to the ``entailment``
  45. attribute of this class.
  46. """
  47. def __init__(
  48. self,
  49. pair,
  50. challenge=None,
  51. id=None,
  52. text=None,
  53. hyp=None,
  54. value=None,
  55. task=None,
  56. length=None,
  57. ):
  58. """
  59. :param challenge: version of the RTE challenge (i.e., RTE1, RTE2 or RTE3)
  60. :param id: identifier for the pair
  61. :param text: the text component of the pair
  62. :param hyp: the hypothesis component of the pair
  63. :param value: classification label for the pair
  64. :param task: attribute for the particular NLP task that the data was drawn from
  65. :param length: attribute for the length of the text of the pair
  66. """
  67. self.challenge = challenge
  68. self.id = pair.attrib["id"]
  69. self.gid = "%s-%s" % (self.challenge, self.id)
  70. self.text = pair[0].text
  71. self.hyp = pair[1].text
  72. if "value" in pair.attrib:
  73. self.value = norm(pair.attrib["value"])
  74. elif "entailment" in pair.attrib:
  75. self.value = norm(pair.attrib["entailment"])
  76. else:
  77. self.value = value
  78. if "task" in pair.attrib:
  79. self.task = pair.attrib["task"]
  80. else:
  81. self.task = task
  82. if "length" in pair.attrib:
  83. self.length = pair.attrib["length"]
  84. else:
  85. self.length = length
  86. def __repr__(self):
  87. if self.challenge:
  88. return "<RTEPair: gid=%s-%s>" % (self.challenge, self.id)
  89. else:
  90. return "<RTEPair: id=%s>" % self.id
  91. class RTECorpusReader(XMLCorpusReader):
  92. """
  93. Corpus reader for corpora in RTE challenges.
  94. This is just a wrapper around the XMLCorpusReader. See module docstring above for the expected
  95. structure of input documents.
  96. """
  97. def _read_etree(self, doc):
  98. """
  99. Map the XML input into an RTEPair.
  100. This uses the ``getiterator()`` method from the ElementTree package to
  101. find all the ``<pair>`` elements.
  102. :param doc: a parsed XML document
  103. :rtype: list(RTEPair)
  104. """
  105. try:
  106. challenge = doc.attrib["challenge"]
  107. except KeyError:
  108. challenge = None
  109. return [RTEPair(pair, challenge=challenge) for pair in doc.getiterator("pair")]
  110. def pairs(self, fileids):
  111. """
  112. Build a list of RTEPairs from a RTE corpus.
  113. :param fileids: a list of RTE corpus fileids
  114. :type: list
  115. :rtype: list(RTEPair)
  116. """
  117. if isinstance(fileids, str):
  118. fileids = [fileids]
  119. return concat([self._read_etree(self.xml(fileid)) for fileid in fileids])