# Natural Language Toolkit: Dependency Corpus Reader
#
# Copyright (C) 2001-2020 NLTK Project
# Author: Kepa Sarasola <kepa.sarasola@ehu.es>
#         Iker Manterola <returntothehangar@hotmail.com>
#
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
- import codecs
- from nltk.parse import DependencyGraph
- from nltk.tokenize import *
- from nltk.corpus.reader.util import *
- from nltk.corpus.reader.api import *
- class DependencyCorpusReader(SyntaxCorpusReader):
- def __init__(
- self,
- root,
- fileids,
- encoding="utf8",
- word_tokenizer=TabTokenizer(),
- sent_tokenizer=RegexpTokenizer("\n", gaps=True),
- para_block_reader=read_blankline_block,
- ):
- # FIXME: Why is it inheritting from SyntaxCorpusReader but initializing
- # from CorpusReader?
- CorpusReader.__init__(self, root, fileids, encoding)
- #########################################################
- def raw(self, fileids=None):
- """
- :return: the given file(s) as a single string.
- :rtype: str
- """
- result = []
- for fileid, encoding in self.abspaths(fileids, include_encoding=True):
- if isinstance(fileid, PathPointer):
- result.append(fileid.open(encoding=encoding).read())
- else:
- with codecs.open(fileid, "r", encoding) as fp:
- result.append(fp.read())
- return concat(result)
- def words(self, fileids=None):
- return concat(
- [
- DependencyCorpusView(fileid, False, False, False, encoding=enc)
- for fileid, enc in self.abspaths(fileids, include_encoding=True)
- ]
- )
- def tagged_words(self, fileids=None):
- return concat(
- [
- DependencyCorpusView(fileid, True, False, False, encoding=enc)
- for fileid, enc in self.abspaths(fileids, include_encoding=True)
- ]
- )
- def sents(self, fileids=None):
- return concat(
- [
- DependencyCorpusView(fileid, False, True, False, encoding=enc)
- for fileid, enc in self.abspaths(fileids, include_encoding=True)
- ]
- )
- def tagged_sents(self, fileids=None):
- return concat(
- [
- DependencyCorpusView(fileid, True, True, False, encoding=enc)
- for fileid, enc in self.abspaths(fileids, include_encoding=True)
- ]
- )
- def parsed_sents(self, fileids=None):
- sents = concat(
- [
- DependencyCorpusView(fileid, False, True, True, encoding=enc)
- for fileid, enc in self.abspaths(fileids, include_encoding=True)
- ]
- )
- return [DependencyGraph(sent) for sent in sents]
- class DependencyCorpusView(StreamBackedCorpusView):
- _DOCSTART = "-DOCSTART- -DOCSTART- O\n" # dokumentu hasiera definitzen da
- def __init__(
- self,
- corpus_file,
- tagged,
- group_by_sent,
- dependencies,
- chunk_types=None,
- encoding="utf8",
- ):
- self._tagged = tagged
- self._dependencies = dependencies
- self._group_by_sent = group_by_sent
- self._chunk_types = chunk_types
- StreamBackedCorpusView.__init__(self, corpus_file, encoding=encoding)
- def read_block(self, stream):
- # Read the next sentence.
- sent = read_blankline_block(stream)[0].strip()
- # Strip off the docstart marker, if present.
- if sent.startswith(self._DOCSTART):
- sent = sent[len(self._DOCSTART) :].lstrip()
- # extract word and tag from any of the formats
- if not self._dependencies:
- lines = [line.split("\t") for line in sent.split("\n")]
- if len(lines[0]) == 3 or len(lines[0]) == 4:
- sent = [(line[0], line[1]) for line in lines]
- elif len(lines[0]) == 10:
- sent = [(line[1], line[4]) for line in lines]
- else:
- raise ValueError("Unexpected number of fields in dependency tree file")
- # discard tags if they weren't requested
- if not self._tagged:
- sent = [word for (word, tag) in sent]
- # Return the result.
- if self._group_by_sent:
- return [sent]
- else:
- return list(sent)
|