# Natural Language Toolkit: CONLL Corpus Reader
#
# Copyright (C) 2001-2020 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
#         Edward Loper <edloper@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT

"""
Read CoNLL-style chunk files.
"""

import textwrap

from nltk.tree import Tree
from nltk.util import LazyMap, LazyConcatenation
from nltk.tag import map_tag
from nltk.corpus.reader.util import *
from nltk.corpus.reader.api import *


class ConllCorpusReader(CorpusReader):
  18. """
  19. A corpus reader for CoNLL-style files. These files consist of a
  20. series of sentences, separated by blank lines. Each sentence is
  21. encoded using a table (or "grid") of values, where each line
  22. corresponds to a single word, and each column corresponds to an
  23. annotation type. The set of columns used by CoNLL-style files can
  24. vary from corpus to corpus; the ``ConllCorpusReader`` constructor
  25. therefore takes an argument, ``columntypes``, which is used to
  26. specify the columns that are used by a given corpus. By default
  27. columns are split by consecutive whitespaces, with the
  28. ``separator`` argument you can set a string to split by (e.g.
  29. ``\'\t\'``).
  30. @todo: Add support for reading from corpora where different
  31. parallel files contain different columns.
  32. @todo: Possibly add caching of the grid corpus view? This would
  33. allow the same grid view to be used by different data access
  34. methods (eg words() and parsed_sents() could both share the
  35. same grid corpus view object).
  36. @todo: Better support for -DOCSTART-. Currently, we just ignore
  37. it, but it could be used to define methods that retrieve a
  38. document at a time (eg parsed_documents()).
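
    Example (a minimal sketch; the corpus root and filename are
    hypothetical)::

        >>> from nltk.corpus.reader import ConllCorpusReader
        >>> reader = ConllCorpusReader(
        ...     'corpora/conll2000', 'train.txt',
        ...     columntypes=('words', 'pos', 'chunk'))  # doctest: +SKIP
        >>> reader.chunked_sents()  # doctest: +SKIP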
  39. """

    # /////////////////////////////////////////////////////////////////
    # Column Types
    # /////////////////////////////////////////////////////////////////

    WORDS = "words"  #: column type for words
    POS = "pos"  #: column type for part-of-speech tags
    TREE = "tree"  #: column type for parse trees
    CHUNK = "chunk"  #: column type for chunk structures
    NE = "ne"  #: column type for named entities
    SRL = "srl"  #: column type for semantic role labels
    IGNORE = "ignore"  #: column type for column that should be ignored

    #: A list of all column types supported by the conll corpus reader.
    COLUMN_TYPES = (WORDS, POS, TREE, CHUNK, NE, SRL, IGNORE)

    # /////////////////////////////////////////////////////////////////
    # Constructor
    # /////////////////////////////////////////////////////////////////

    def __init__(
        self,
        root,
        fileids,
        columntypes,
        chunk_types=None,
        root_label="S",
        pos_in_tree=False,
        srl_includes_roleset=True,
        encoding="utf8",
        tree_class=Tree,
        tagset=None,
        separator=None,
    ):
        for columntype in columntypes:
            if columntype not in self.COLUMN_TYPES:
                raise ValueError("Bad column type %r" % columntype)
        if isinstance(chunk_types, str):
            chunk_types = [chunk_types]
        self._chunk_types = chunk_types
        self._colmap = dict((c, i) for (i, c) in enumerate(columntypes))
        self._pos_in_tree = pos_in_tree
        self._root_label = root_label  # for chunks
        self._srl_includes_roleset = srl_includes_roleset
        self._tree_class = tree_class
        CorpusReader.__init__(self, root, fileids, encoding)
        self._tagset = tagset
        self.sep = separator

    # /////////////////////////////////////////////////////////////////
    # Data Access Methods
    # /////////////////////////////////////////////////////////////////

    def raw(self, fileids=None):
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, str):
            fileids = [fileids]
        return concat([self.open(f).read() for f in fileids])

    def words(self, fileids=None):
        self._require(self.WORDS)
        return LazyConcatenation(LazyMap(self._get_words, self._grids(fileids)))

    def sents(self, fileids=None):
        self._require(self.WORDS)
        return LazyMap(self._get_words, self._grids(fileids))

    def tagged_words(self, fileids=None, tagset=None):
        self._require(self.WORDS, self.POS)

        def get_tagged_words(grid):
            return self._get_tagged_words(grid, tagset)

        return LazyConcatenation(LazyMap(get_tagged_words, self._grids(fileids)))

    def tagged_sents(self, fileids=None, tagset=None):
        self._require(self.WORDS, self.POS)

        def get_tagged_words(grid):
            return self._get_tagged_words(grid, tagset)

        return LazyMap(get_tagged_words, self._grids(fileids))

    def chunked_words(self, fileids=None, chunk_types=None, tagset=None):
        self._require(self.WORDS, self.POS, self.CHUNK)
        if chunk_types is None:
            chunk_types = self._chunk_types

        def get_chunked_words(grid):  # capture chunk_types as local var
            return self._get_chunked_words(grid, chunk_types, tagset)

        return LazyConcatenation(LazyMap(get_chunked_words, self._grids(fileids)))

    def chunked_sents(self, fileids=None, chunk_types=None, tagset=None):
        self._require(self.WORDS, self.POS, self.CHUNK)
        if chunk_types is None:
            chunk_types = self._chunk_types

        def get_chunked_words(grid):  # capture chunk_types as local var
            return self._get_chunked_words(grid, chunk_types, tagset)

        return LazyMap(get_chunked_words, self._grids(fileids))

    def parsed_sents(self, fileids=None, pos_in_tree=None, tagset=None):
        self._require(self.WORDS, self.POS, self.TREE)
        if pos_in_tree is None:
            pos_in_tree = self._pos_in_tree

        def get_parsed_sent(grid):  # capture pos_in_tree as local var
            return self._get_parsed_sent(grid, pos_in_tree, tagset)

        return LazyMap(get_parsed_sent, self._grids(fileids))

    def srl_spans(self, fileids=None):
        self._require(self.SRL)
        return LazyMap(self._get_srl_spans, self._grids(fileids))

    def srl_instances(self, fileids=None, pos_in_tree=None, flatten=True):
        self._require(self.WORDS, self.POS, self.TREE, self.SRL)
        if pos_in_tree is None:
            pos_in_tree = self._pos_in_tree

        def get_srl_instances(grid):  # capture pos_in_tree as local var
            return self._get_srl_instances(grid, pos_in_tree)

        result = LazyMap(get_srl_instances, self._grids(fileids))
        if flatten:
            result = LazyConcatenation(result)
        return result

    def iob_words(self, fileids=None, tagset=None):
        """
        :return: a list of word/tag/IOB tuples
        :rtype: list(tuple)
        :param fileids: the list of fileids that make up this corpus
        :type fileids: None or str or list
        """
        self._require(self.WORDS, self.POS, self.CHUNK)

        def get_iob_words(grid):
            return self._get_iob_words(grid, tagset)

        return LazyConcatenation(LazyMap(get_iob_words, self._grids(fileids)))

    def iob_sents(self, fileids=None, tagset=None):
        """
        :return: a list of lists of word/tag/IOB tuples
        :rtype: list(list)
        :param fileids: the list of fileids that make up this corpus
        :type fileids: None or str or list
        """
        self._require(self.WORDS, self.POS, self.CHUNK)

        def get_iob_words(grid):
            return self._get_iob_words(grid, tagset)

        return LazyMap(get_iob_words, self._grids(fileids))

    # /////////////////////////////////////////////////////////////////
    # Grid Reading
    # /////////////////////////////////////////////////////////////////

    def _grids(self, fileids=None):
        # n.b.: we could cache the object returned here (keyed on
        # fileids), which would let us reuse the same corpus view for
        # different things (e.g. srl and parse trees).
        return concat(
            [
                StreamBackedCorpusView(fileid, self._read_grid_block, encoding=enc)
                for (fileid, enc) in self.abspaths(fileids, True)
            ]
        )

    def _read_grid_block(self, stream):
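        """
        Read one blank-line-delimited block of sentences from ``stream``
        and return it as a list of grids, one per sentence.  Each grid
        is a list of rows, one per word, and each row is the list of
        that word's column values.  A leading ``-DOCSTART-`` row, if
        present, is discarded.
        """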
        grids = []
        for block in read_blankline_block(stream):
            block = block.strip()
            if not block:
                continue

            grid = [line.split(self.sep) for line in block.split("\n")]

            # If there's a docstart row, then discard it.  ([xx] eventually
            # it would be good to actually use it.)
            if grid[0][self._colmap.get("words", 0)] == "-DOCSTART-":
                del grid[0]

            # Check that the grid is consistent.
            for row in grid:
                if len(row) != len(grid[0]):
                    raise ValueError("Inconsistent number of columns:\n%s" % block)
            grids.append(grid)
        return grids

    # /////////////////////////////////////////////////////////////////
    # Transforms
    # /////////////////////////////////////////////////////////////////
    # Given a grid, transform it into some representation (e.g.,
    # a list of words or a parse tree).
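    #
    # For example, the three-column (word/pos/chunk) block
    #
    #     Confidence  NN  B-NP
    #     in          IN  B-PP
    #     pound       NN  B-NP
    #
    # is read as the grid
    #
    #     [['Confidence', 'NN', 'B-NP'],
    #      ['in', 'IN', 'B-PP'],
    #      ['pound', 'NN', 'B-NP']]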

    def _get_words(self, grid):
        return self._get_column(grid, self._colmap["words"])

    def _get_tagged_words(self, grid, tagset=None):
        pos_tags = self._get_column(grid, self._colmap["pos"])
        if tagset and tagset != self._tagset:
            pos_tags = [map_tag(self._tagset, tagset, t) for t in pos_tags]
        return list(zip(self._get_column(grid, self._colmap["words"]), pos_tags))

    def _get_iob_words(self, grid, tagset=None):
        pos_tags = self._get_column(grid, self._colmap["pos"])
        if tagset and tagset != self._tagset:
            pos_tags = [map_tag(self._tagset, tagset, t) for t in pos_tags]
        return list(
            zip(
                self._get_column(grid, self._colmap["words"]),
                pos_tags,
                self._get_column(grid, self._colmap["chunk"]),
            )
        )

    def _get_chunked_words(self, grid, chunk_types, tagset=None):
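        """
        Combine the word, POS, and chunk columns of ``grid`` into a
        shallow chunk tree rooted at ``self._root_label``.  A ``B-`` tag
        opens a new chunk, an ``I-`` tag continues the open chunk (an
        ``I-`` whose type does not match the open chunk is treated as a
        ``B-``), and ``O`` closes any open chunk.
        """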
        # n.b.: this method is very similar to conllstr2tree.
        words = self._get_column(grid, self._colmap["words"])
        pos_tags = self._get_column(grid, self._colmap["pos"])
        if tagset and tagset != self._tagset:
            pos_tags = [map_tag(self._tagset, tagset, t) for t in pos_tags]
        chunk_tags = self._get_column(grid, self._colmap["chunk"])

        stack = [Tree(self._root_label, [])]

        for (word, pos_tag, chunk_tag) in zip(words, pos_tags, chunk_tags):
            if chunk_tag == "O":
                state, chunk_type = "O", ""
            else:
                (state, chunk_type) = chunk_tag.split("-")

            # If it's a chunk we don't care about, treat it as O.
            if chunk_types is not None and chunk_type not in chunk_types:
                state = "O"

            # Treat a mismatching I like a B.
            if state == "I" and chunk_type != stack[-1].label():
                state = "B"

            # For B or O: close any open chunks.
            if state in "BO" and len(stack) == 2:
                stack.pop()

            # For B: start a new chunk.
            if state == "B":
                new_chunk = Tree(chunk_type, [])
                stack[-1].append(new_chunk)
                stack.append(new_chunk)

            # Add the word token.
            stack[-1].append((word, pos_tag))

        return stack[0]

    def _get_parsed_sent(self, grid, pos_in_tree, tagset=None):
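        """
        Rebuild a parse tree from the tree column of ``grid``.  Each
        row's tree tag has the form ``LEFT*RIGHT`` (e.g. ``(S(NP*)``),
        where ``LEFT`` opens constituents and ``RIGHT`` closes them;
        splicing ``(pos word)`` in place of each ``*`` yields a
        bracketed string that the tree class can parse.
        """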
        words = self._get_column(grid, self._colmap["words"])
        pos_tags = self._get_column(grid, self._colmap["pos"])
        if tagset and tagset != self._tagset:
            pos_tags = [map_tag(self._tagset, tagset, t) for t in pos_tags]
        parse_tags = self._get_column(grid, self._colmap["tree"])

        treestr = ""
        for (word, pos_tag, parse_tag) in zip(words, pos_tags, parse_tags):
            if word == "(":
                word = "-LRB-"
            if word == ")":
                word = "-RRB-"
            if pos_tag == "(":
                pos_tag = "-LRB-"
            if pos_tag == ")":
                pos_tag = "-RRB-"
            (left, right) = parse_tag.split("*")
            right = right.count(")") * ")"  # only keep ')'.
            treestr += "%s (%s %s) %s" % (left, pos_tag, word, right)
        try:
            tree = self._tree_class.fromstring(treestr)
        except (ValueError, IndexError):
            tree = self._tree_class.fromstring("(%s %s)" % (self._root_label, treestr))

        if not pos_in_tree:
            for subtree in tree.subtrees():
                for i, child in enumerate(subtree):
                    if (
                        isinstance(child, Tree)
                        and len(child) == 1
                        and isinstance(child[0], str)
                    ):
                        subtree[i] = (child[0], child.label())

        return tree

    def _get_srl_spans(self, grid):
        """
        Return a list of spanlists, one per predicate; each spanlist is
        a list of ``((start, end), tag)`` tuples.
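
        The SRL argument columns use the same ``LEFT*RIGHT`` bracket
        encoding as the tree column: e.g. ``(A0*`` opens an ``A0`` span
        and ``*)`` closes the innermost open span.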
  283. """
        if self._srl_includes_roleset:
            predicates = self._get_column(grid, self._colmap["srl"] + 1)
            start_col = self._colmap["srl"] + 2
        else:
            predicates = self._get_column(grid, self._colmap["srl"])
            start_col = self._colmap["srl"] + 1

        # Count how many predicates there are.  This tells us how many
        # columns to expect for SRL data.
        num_preds = len([p for p in predicates if p != "-"])

        spanlists = []
        for i in range(num_preds):
            col = self._get_column(grid, start_col + i)
            spanlist = []
            stack = []
            for wordnum, srl_tag in enumerate(col):
                (left, right) = srl_tag.split("*")
                for tag in left.split("("):
                    if tag:
                        stack.append((tag, wordnum))
                for _ in range(right.count(")")):  # close one span per ')'
                    (tag, start) = stack.pop()
                    spanlist.append(((start, wordnum + 1), tag))
            spanlists.append(spanlist)

        return spanlists

    def _get_srl_instances(self, grid, pos_in_tree):
        tree = self._get_parsed_sent(grid, pos_in_tree)
        spanlists = self._get_srl_spans(grid)
        if self._srl_includes_roleset:
            predicates = self._get_column(grid, self._colmap["srl"] + 1)
            rolesets = self._get_column(grid, self._colmap["srl"])
        else:
            predicates = self._get_column(grid, self._colmap["srl"])
            rolesets = [None] * len(predicates)

        instances = ConllSRLInstanceList(tree)
        for wordnum, predicate in enumerate(predicates):
            if predicate == "-":
                continue
            # Decide which spanlist to use.  Don't assume that they're
            # sorted in the same order as the predicates (even though
            # they usually are).
            for spanlist in spanlists:
                for (start, end), tag in spanlist:
                    if wordnum in range(start, end) and tag in ("V", "C-V"):
                        break
                else:
                    continue
                break
            else:
                raise ValueError("No srl column found for %r" % predicate)
            instances.append(
                ConllSRLInstance(tree, wordnum, predicate, rolesets[wordnum], spanlist)
            )

        return instances

    # /////////////////////////////////////////////////////////////////
    # Helper Methods
    # /////////////////////////////////////////////////////////////////

    def _require(self, *columntypes):
        for columntype in columntypes:
            if columntype not in self._colmap:
                raise ValueError(
                    "This corpus does not contain a %s column." % columntype
                )

    @staticmethod
    def _get_column(grid, column_index):
        return [grid[i][column_index] for i in range(len(grid))]


class ConllSRLInstance(object):
    """
    An SRL instance from a CoNLL corpus, which identifies and
    provides labels for the arguments of a single verb.
    """

    # [xx] add inst.core_arguments, inst.argm_arguments?

    def __init__(self, tree, verb_head, verb_stem, roleset, tagged_spans):
        self.verb = []
        """A list of the word indices of the words that compose the
        verb whose arguments are identified by this instance.
        This will contain multiple word indices when multi-word
        verbs are used (e.g. 'turn on')."""

        self.verb_head = verb_head
        """The word index of the head word of the verb whose arguments
        are identified by this instance.  E.g., for a sentence that
        uses the verb 'turn on,' ``verb_head`` will be the word index
        of the word 'turn'."""

        self.verb_stem = verb_stem

        self.roleset = roleset

        self.arguments = []
        """A list of ``(argspan, argid)`` tuples, specifying the location
        and type for each of the arguments identified by this
        instance.  ``argspan`` is a tuple ``(start, end)``, indicating
        that the argument consists of the ``words[start:end]``."""

        self.tagged_spans = tagged_spans
        """A list of ``(span, id)`` tuples, specifying the location and
        type for each of the arguments, as well as the verb pieces,
        that make up this instance."""

        self.tree = tree
        """The parse tree for the sentence containing this instance."""

        self.words = tree.leaves()
        """A list of the words in the sentence containing this
        instance."""

        # Fill in the self.verb and self.arguments values.
        for (start, end), tag in tagged_spans:
            if tag in ("V", "C-V"):
                self.verb += list(range(start, end))
            else:
                self.arguments.append(((start, end), tag))

    def __repr__(self):
        plural = "s" if len(self.arguments) != 1 else ""
        return "<ConllSRLInstance for %r with %d argument%s>" % (
            self.verb_stem,
            len(self.arguments),
            plural,
        )

    def pprint(self):
        verbstr = " ".join(self.words[i][0] for i in self.verb)
        hdr = "SRL for %r (stem=%r):\n" % (verbstr, self.verb_stem)
        s = ""
        for i, word in enumerate(self.words):
            if isinstance(word, tuple):
                word = word[0]
            for (start, end), argid in self.arguments:
                if i == start:
                    s += "[%s " % argid
                if i == end:
                    s += "] "
            if i in self.verb:
                word = "<<%s>>" % word
            s += word + " "
        return hdr + textwrap.fill(
            s.replace(" ]", "]"), initial_indent="    ", subsequent_indent="    "
        )


class ConllSRLInstanceList(list):
    """
    Set of instances for a single sentence.
    """

    def __init__(self, tree, instances=()):
        self.tree = tree
        list.__init__(self, instances)

    def __str__(self):
        return self.pprint()

    def pprint(self, include_tree=False):
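        """
        Return this sentence's instances formatted as CoNLL-style rows,
        one per word: optional word/POS/parse columns (when
        ``include_tree`` is true), then the verb-stem column, then one
        argument column per instance.
        """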
        # Sanity check: trees should be the same.
        for inst in self:
            if inst.tree != self.tree:
                raise ValueError("Tree mismatch!")

        # Define words here (not inside the if block below), since the
        # main loop needs it even when the tree columns are omitted.
        words = self.tree.leaves()

        # If desired, add trees:
        if include_tree:
            pos = [None] * len(words)
            synt = ["*"] * len(words)
            self._tree2conll(self.tree, 0, words, pos, synt)

        s = ""
        for i in range(len(words)):
            # optional tree columns
            if include_tree:
                s += "%-20s " % words[i]
                s += "%-8s " % pos[i]
                s += "%15s*%-8s " % tuple(synt[i].split("*"))

            # verb head column
            for inst in self:
                if i == inst.verb_head:
                    s += "%-20s " % inst.verb_stem
                    break
            else:
                s += "%-20s " % "-"

            # Remaining columns: one argument column per instance.
            for inst in self:
                argstr = "*"
                for (start, end), argid in inst.tagged_spans:
                    if i == start:
                        argstr = "(%s%s" % (argid, argstr)
                    if i == (end - 1):
                        argstr += ")"
                s += "%-12s " % argstr
            s += "\n"
        return s

    def _tree2conll(self, tree, wordnum, words, pos, synt):
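        """
        Walk ``tree`` depth-first, filling in the ``pos`` and ``synt``
        (bracketed-parse) values for each word starting at ``wordnum``,
        and return the index of the first word after this subtree.
        """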
        assert isinstance(tree, Tree)
        if len(tree) == 1 and isinstance(tree[0], str):
            pos[wordnum] = tree.label()
            assert words[wordnum] == tree[0]
            return wordnum + 1
        elif len(tree) == 1 and isinstance(tree[0], tuple):
            assert len(tree[0]) == 2
            words[wordnum], pos[wordnum] = tree[0]  # a (word, pos) leaf
            return wordnum + 1
        else:
            synt[wordnum] = "(%s%s" % (tree.label(), synt[wordnum])
            for child in tree:
                wordnum = self._tree2conll(child, wordnum, words, pos, synt)
            synt[wordnum - 1] += ")"
            return wordnum


class ConllChunkCorpusReader(ConllCorpusReader):
    """
    A ConllCorpusReader whose data file contains three columns: words,
    pos, and chunk.
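
    Example (a minimal sketch; the corpus root and filename are
    hypothetical)::

        >>> reader = ConllChunkCorpusReader(
        ...     'corpora/conll2000', 'train.txt',
        ...     chunk_types=('NP', 'VP', 'PP'))  # doctest: +SKIP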
  478. """

    def __init__(
        self, root, fileids, chunk_types, encoding="utf8", tagset=None, separator=None
    ):
        ConllCorpusReader.__init__(
            self,
            root,
            fileids,
            ("words", "pos", "chunk"),
            chunk_types=chunk_types,
            encoding=encoding,
            tagset=tagset,
            separator=separator,
        )