| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631 |
- # CHILDES XML Corpus Reader
- # Copyright (C) 2001-2020 NLTK Project
- # Author: Tomonori Nagano <tnagano@gc.cuny.edu>
- # Alexis Dimitriadis <A.Dimitriadis@uu.nl>
- # URL: <http://nltk.org/>
- # For license information, see LICENSE.TXT
- """
- Corpus reader for the XML version of the CHILDES corpus.
- """
- __docformat__ = "epytext en"
- import re
- from collections import defaultdict
- from nltk.util import flatten, LazyMap, LazyConcatenation
- from nltk.corpus.reader.util import concat
- from nltk.corpus.reader.xmldocs import XMLCorpusReader, ElementTree
- # to resolve the namespace issue
- NS = "http://www.talkbank.org/ns/talkbank"
- class CHILDESCorpusReader(XMLCorpusReader):
- """
- Corpus reader for the XML version of the CHILDES corpus.
- The CHILDES corpus is available at ``https://childes.talkbank.org/``. The XML
- version of CHILDES is located at ``https://childes.talkbank.org/data-xml/``.
- Copy the needed parts of the CHILDES XML corpus into the NLTK data directory
- (``nltk_data/corpora/CHILDES/``).
- For access to the file text use the usual nltk functions,
- ``words()``, ``sents()``, ``tagged_words()`` and ``tagged_sents()``.
- """
- def __init__(self, root, fileids, lazy=True):
- XMLCorpusReader.__init__(self, root, fileids)
- self._lazy = lazy
- def words(
- self,
- fileids=None,
- speaker="ALL",
- stem=False,
- relation=False,
- strip_space=True,
- replace=False,
- ):
- """
- :return: the given file(s) as a list of words
- :rtype: list(str)
- :param speaker: If specified, select specific speaker(s) defined
- in the corpus. Default is 'ALL' (all participants). Common choices
- are 'CHI' (the child), 'MOT' (mother), ['CHI','MOT'] (exclude
- researchers)
- :param stem: If true, then use word stems instead of word strings.
- :param relation: If true, then return tuples of (stem, index,
- dependent_index)
- :param strip_space: If true, then strip trailing spaces from word
- tokens. Otherwise, leave the spaces on the tokens.
- :param replace: If true, then use the replaced (intended) word instead
- of the original word (e.g., 'wat' will be replaced with 'watch')
- """
- sent = None
- pos = False
- if not self._lazy:
- return [
- self._get_words(
- fileid, speaker, sent, stem, relation, pos, strip_space, replace
- )
- for fileid in self.abspaths(fileids)
- ]
- get_words = lambda fileid: self._get_words(
- fileid, speaker, sent, stem, relation, pos, strip_space, replace
- )
- return LazyConcatenation(LazyMap(get_words, self.abspaths(fileids)))
- def tagged_words(
- self,
- fileids=None,
- speaker="ALL",
- stem=False,
- relation=False,
- strip_space=True,
- replace=False,
- ):
- """
- :return: the given file(s) as a list of tagged
- words and punctuation symbols, encoded as tuples
- ``(word,tag)``.
- :rtype: list(tuple(str,str))
- :param speaker: If specified, select specific speaker(s) defined
- in the corpus. Default is 'ALL' (all participants). Common choices
- are 'CHI' (the child), 'MOT' (mother), ['CHI','MOT'] (exclude
- researchers)
- :param stem: If true, then use word stems instead of word strings.
- :param relation: If true, then return tuples of (stem, index,
- dependent_index)
- :param strip_space: If true, then strip trailing spaces from word
- tokens. Otherwise, leave the spaces on the tokens.
- :param replace: If true, then use the replaced (intended) word instead
- of the original word (e.g., 'wat' will be replaced with 'watch')
- """
- sent = None
- pos = True
- if not self._lazy:
- return [
- self._get_words(
- fileid, speaker, sent, stem, relation, pos, strip_space, replace
- )
- for fileid in self.abspaths(fileids)
- ]
- get_words = lambda fileid: self._get_words(
- fileid, speaker, sent, stem, relation, pos, strip_space, replace
- )
- return LazyConcatenation(LazyMap(get_words, self.abspaths(fileids)))
- def sents(
- self,
- fileids=None,
- speaker="ALL",
- stem=False,
- relation=None,
- strip_space=True,
- replace=False,
- ):
- """
- :return: the given file(s) as a list of sentences or utterances, each
- encoded as a list of word strings.
- :rtype: list(list(str))
- :param speaker: If specified, select specific speaker(s) defined
- in the corpus. Default is 'ALL' (all participants). Common choices
- are 'CHI' (the child), 'MOT' (mother), ['CHI','MOT'] (exclude
- researchers)
- :param stem: If true, then use word stems instead of word strings.
- :param relation: If true, then return tuples of ``(str,pos,relation_list)``.
- If there is manually-annotated relation info, it will return
- tuples of ``(str,pos,test_relation_list,str,pos,gold_relation_list)``
- :param strip_space: If true, then strip trailing spaces from word
- tokens. Otherwise, leave the spaces on the tokens.
- :param replace: If true, then use the replaced (intended) word instead
- of the original word (e.g., 'wat' will be replaced with 'watch')
- """
- sent = True
- pos = False
- if not self._lazy:
- return [
- self._get_words(
- fileid, speaker, sent, stem, relation, pos, strip_space, replace
- )
- for fileid in self.abspaths(fileids)
- ]
- get_words = lambda fileid: self._get_words(
- fileid, speaker, sent, stem, relation, pos, strip_space, replace
- )
- return LazyConcatenation(LazyMap(get_words, self.abspaths(fileids)))
- def tagged_sents(
- self,
- fileids=None,
- speaker="ALL",
- stem=False,
- relation=None,
- strip_space=True,
- replace=False,
- ):
- """
- :return: the given file(s) as a list of
- sentences, each encoded as a list of ``(word,tag)`` tuples.
- :rtype: list(list(tuple(str,str)))
- :param speaker: If specified, select specific speaker(s) defined
- in the corpus. Default is 'ALL' (all participants). Common choices
- are 'CHI' (the child), 'MOT' (mother), ['CHI','MOT'] (exclude
- researchers)
- :param stem: If true, then use word stems instead of word strings.
- :param relation: If true, then return tuples of ``(str,pos,relation_list)``.
- If there is manually-annotated relation info, it will return
- tuples of ``(str,pos,test_relation_list,str,pos,gold_relation_list)``
- :param strip_space: If true, then strip trailing spaces from word
- tokens. Otherwise, leave the spaces on the tokens.
- :param replace: If true, then use the replaced (intended) word instead
- of the original word (e.g., 'wat' will be replaced with 'watch')
- """
- sent = True
- pos = True
- if not self._lazy:
- return [
- self._get_words(
- fileid, speaker, sent, stem, relation, pos, strip_space, replace
- )
- for fileid in self.abspaths(fileids)
- ]
- get_words = lambda fileid: self._get_words(
- fileid, speaker, sent, stem, relation, pos, strip_space, replace
- )
- return LazyConcatenation(LazyMap(get_words, self.abspaths(fileids)))
- def corpus(self, fileids=None):
- """
- :return: the given file(s) as a dict of ``(corpus_property_key, value)``
- :rtype: list(dict)
- """
- if not self._lazy:
- return [self._get_corpus(fileid) for fileid in self.abspaths(fileids)]
- return LazyMap(self._get_corpus, self.abspaths(fileids))
- def _get_corpus(self, fileid):
- results = dict()
- xmldoc = ElementTree.parse(fileid).getroot()
- for key, value in xmldoc.items():
- results[key] = value
- return results
- def participants(self, fileids=None):
- """
- :return: the given file(s) as a dict of
- ``(participant_property_key, value)``
- :rtype: list(dict)
- """
- if not self._lazy:
- return [self._get_participants(fileid) for fileid in self.abspaths(fileids)]
- return LazyMap(self._get_participants, self.abspaths(fileids))
- def _get_participants(self, fileid):
- # multidimensional dicts
- def dictOfDicts():
- return defaultdict(dictOfDicts)
- xmldoc = ElementTree.parse(fileid).getroot()
- # getting participants' data
- pat = dictOfDicts()
- for participant in xmldoc.findall(
- ".//{%s}Participants/{%s}participant" % (NS, NS)
- ):
- for (key, value) in participant.items():
- pat[participant.get("id")][key] = value
- return pat
- def age(self, fileids=None, speaker="CHI", month=False):
- """
- :return: the given file(s) as string or int
- :rtype: list or int
- :param month: If true, return months instead of year-month-date
- """
- if not self._lazy:
- return [
- self._get_age(fileid, speaker, month)
- for fileid in self.abspaths(fileids)
- ]
- get_age = lambda fileid: self._get_age(fileid, speaker, month)
- return LazyMap(get_age, self.abspaths(fileids))
- def _get_age(self, fileid, speaker, month):
- xmldoc = ElementTree.parse(fileid).getroot()
- for pat in xmldoc.findall(".//{%s}Participants/{%s}participant" % (NS, NS)):
- try:
- if pat.get("id") == speaker:
- age = pat.get("age")
- if month:
- age = self.convert_age(age)
- return age
- # some files don't have age data
- except (TypeError, AttributeError) as e:
- return None
- def convert_age(self, age_year):
- "Caclculate age in months from a string in CHILDES format"
- m = re.match("P(\d+)Y(\d+)M?(\d?\d?)D?", age_year)
- age_month = int(m.group(1)) * 12 + int(m.group(2))
- try:
- if int(m.group(3)) > 15:
- age_month += 1
- # some corpora don't have age information?
- except ValueError as e:
- pass
- return age_month
- def MLU(self, fileids=None, speaker="CHI"):
- """
- :return: the given file(s) as a floating number
- :rtype: list(float)
- """
- if not self._lazy:
- return [
- self._getMLU(fileid, speaker=speaker)
- for fileid in self.abspaths(fileids)
- ]
- get_MLU = lambda fileid: self._getMLU(fileid, speaker=speaker)
- return LazyMap(get_MLU, self.abspaths(fileids))
- def _getMLU(self, fileid, speaker):
- sents = self._get_words(
- fileid,
- speaker=speaker,
- sent=True,
- stem=True,
- relation=False,
- pos=True,
- strip_space=True,
- replace=True,
- )
- results = []
- lastSent = []
- numFillers = 0
- sentDiscount = 0
- for sent in sents:
- posList = [pos for (word, pos) in sent]
- # if any part of the sentence is intelligible
- if any(pos == "unk" for pos in posList):
- continue
- # if the sentence is null
- elif sent == []:
- continue
- # if the sentence is the same as the last sent
- elif sent == lastSent:
- continue
- else:
- results.append([word for (word, pos) in sent])
- # count number of fillers
- if len(set(["co", None]).intersection(posList)) > 0:
- numFillers += posList.count("co")
- numFillers += posList.count(None)
- sentDiscount += 1
- lastSent = sent
- try:
- thisWordList = flatten(results)
- # count number of morphemes
- # (e.g., 'read' = 1 morpheme but 'read-PAST' is 2 morphemes)
- numWords = (
- len(flatten([word.split("-") for word in thisWordList])) - numFillers
- )
- numSents = len(results) - sentDiscount
- mlu = numWords / numSents
- except ZeroDivisionError:
- mlu = 0
- # return {'mlu':mlu,'wordNum':numWords,'sentNum':numSents}
- return mlu
- def _get_words(
- self, fileid, speaker, sent, stem, relation, pos, strip_space, replace
- ):
- if (
- isinstance(speaker, str) and speaker != "ALL"
- ): # ensure we have a list of speakers
- speaker = [speaker]
- xmldoc = ElementTree.parse(fileid).getroot()
- # processing each xml doc
- results = []
- for xmlsent in xmldoc.findall(".//{%s}u" % NS):
- sents = []
- # select speakers
- if speaker == "ALL" or xmlsent.get("who") in speaker:
- for xmlword in xmlsent.findall(".//{%s}w" % NS):
- infl = None
- suffixStem = None
- suffixTag = None
- # getting replaced words
- if replace and xmlsent.find(".//{%s}w/{%s}replacement" % (NS, NS)):
- xmlword = xmlsent.find(
- ".//{%s}w/{%s}replacement/{%s}w" % (NS, NS, NS)
- )
- elif replace and xmlsent.find(".//{%s}w/{%s}wk" % (NS, NS)):
- xmlword = xmlsent.find(".//{%s}w/{%s}wk" % (NS, NS))
- # get text
- if xmlword.text:
- word = xmlword.text
- else:
- word = ""
- # strip tailing space
- if strip_space:
- word = word.strip()
- # stem
- if relation or stem:
- try:
- xmlstem = xmlword.find(".//{%s}stem" % NS)
- word = xmlstem.text
- except AttributeError as e:
- pass
- # if there is an inflection
- try:
- xmlinfl = xmlword.find(
- ".//{%s}mor/{%s}mw/{%s}mk" % (NS, NS, NS)
- )
- word += "-" + xmlinfl.text
- except:
- pass
- # if there is a suffix
- try:
- xmlsuffix = xmlword.find(
- ".//{%s}mor/{%s}mor-post/{%s}mw/{%s}stem"
- % (NS, NS, NS, NS)
- )
- suffixStem = xmlsuffix.text
- except AttributeError:
- suffixStem = ""
- if suffixStem:
- word += "~" + suffixStem
- # pos
- if relation or pos:
- try:
- xmlpos = xmlword.findall(".//{%s}c" % NS)
- xmlpos2 = xmlword.findall(".//{%s}s" % NS)
- if xmlpos2 != []:
- tag = xmlpos[0].text + ":" + xmlpos2[0].text
- else:
- tag = xmlpos[0].text
- except (AttributeError, IndexError) as e:
- tag = ""
- try:
- xmlsuffixpos = xmlword.findall(
- ".//{%s}mor/{%s}mor-post/{%s}mw/{%s}pos/{%s}c"
- % (NS, NS, NS, NS, NS)
- )
- xmlsuffixpos2 = xmlword.findall(
- ".//{%s}mor/{%s}mor-post/{%s}mw/{%s}pos/{%s}s"
- % (NS, NS, NS, NS, NS)
- )
- if xmlsuffixpos2:
- suffixTag = (
- xmlsuffixpos[0].text + ":" + xmlsuffixpos2[0].text
- )
- else:
- suffixTag = xmlsuffixpos[0].text
- except:
- pass
- if suffixTag:
- tag += "~" + suffixTag
- word = (word, tag)
- # relational
- # the gold standard is stored in
- # <mor></mor><mor type="trn"><gra type="grt">
- if relation == True:
- for xmlstem_rel in xmlword.findall(
- ".//{%s}mor/{%s}gra" % (NS, NS)
- ):
- if not xmlstem_rel.get("type") == "grt":
- word = (
- word[0],
- word[1],
- xmlstem_rel.get("index")
- + "|"
- + xmlstem_rel.get("head")
- + "|"
- + xmlstem_rel.get("relation"),
- )
- else:
- word = (
- word[0],
- word[1],
- word[2],
- word[0],
- word[1],
- xmlstem_rel.get("index")
- + "|"
- + xmlstem_rel.get("head")
- + "|"
- + xmlstem_rel.get("relation"),
- )
- try:
- for xmlpost_rel in xmlword.findall(
- ".//{%s}mor/{%s}mor-post/{%s}gra" % (NS, NS, NS)
- ):
- if not xmlpost_rel.get("type") == "grt":
- suffixStem = (
- suffixStem[0],
- suffixStem[1],
- xmlpost_rel.get("index")
- + "|"
- + xmlpost_rel.get("head")
- + "|"
- + xmlpost_rel.get("relation"),
- )
- else:
- suffixStem = (
- suffixStem[0],
- suffixStem[1],
- suffixStem[2],
- suffixStem[0],
- suffixStem[1],
- xmlpost_rel.get("index")
- + "|"
- + xmlpost_rel.get("head")
- + "|"
- + xmlpost_rel.get("relation"),
- )
- except:
- pass
- sents.append(word)
- if sent or relation:
- results.append(sents)
- else:
- results.extend(sents)
- return LazyMap(lambda x: x, results)
- # Ready-to-use browser opener
- """
- The base URL for viewing files on the childes website. This
- shouldn't need to be changed, unless CHILDES changes the configuration
- of their server or unless the user sets up their own corpus webserver.
- """
- childes_url_base = r"https://childes.talkbank.org/browser/index.php?url="
- def webview_file(self, fileid, urlbase=None):
- """Map a corpus file to its web version on the CHILDES website,
- and open it in a web browser.
- The complete URL to be used is:
- childes.childes_url_base + urlbase + fileid.replace('.xml', '.cha')
- If no urlbase is passed, we try to calculate it. This
- requires that the childes corpus was set up to mirror the
- folder hierarchy under childes.psy.cmu.edu/data-xml/, e.g.:
- nltk_data/corpora/childes/Eng-USA/Cornell/??? or
- nltk_data/corpora/childes/Romance/Spanish/Aguirre/???
- The function first looks (as a special case) if "Eng-USA" is
- on the path consisting of <corpus root>+fileid; then if
- "childes", possibly followed by "data-xml", appears. If neither
- one is found, we use the unmodified fileid and hope for the best.
- If this is not right, specify urlbase explicitly, e.g., if the
- corpus root points to the Cornell folder, urlbase='Eng-USA/Cornell'.
- """
- import webbrowser
- if urlbase:
- path = urlbase + "/" + fileid
- else:
- full = self.root + "/" + fileid
- full = re.sub(r"\\", "/", full)
- if "/childes/" in full.lower():
- # Discard /data-xml/ if present
- path = re.findall(r"(?i)/childes(?:/data-xml)?/(.*)\.xml", full)[0]
- elif "eng-usa" in full.lower():
- path = "Eng-USA/" + re.findall(r"/(?i)Eng-USA/(.*)\.xml", full)[0]
- else:
- path = fileid
- # Strip ".xml" and add ".cha", as necessary:
- if path.endswith(".xml"):
- path = path[:-4]
- if not path.endswith(".cha"):
- path = path + ".cha"
- url = self.childes_url_base + path
- webbrowser.open_new_tab(url)
- print("Opening in browser:", url)
- # Pausing is a good idea, but it's up to the user...
- # raw_input("Hit Return to continue")
def demo(corpus_root=None):
    """
    The CHILDES corpus should be manually downloaded and saved
    to ``[NLTK_Data_Dir]/corpora/childes/``
    """
    if not corpus_root:
        from nltk.data import find

        corpus_root = find("corpora/childes/data-xml/Eng-USA/")

    try:
        childes = CHILDESCorpusReader(corpus_root, ".*.xml")
        # Describe the first few files of the corpus.
        for file in childes.fileids()[:5]:
            # Pull the corpus name and id out of the file's root attributes.
            attrs = childes.corpus(file)[0]
            corpus = attrs.get("Corpus", "")
            corpus_id = attrs.get("Id", "")
            print("Reading", corpus, corpus_id, " .....")
            print("words:", childes.words(file)[:7], "...")
            print(
                "words with replaced words:",
                childes.words(file, replace=True)[:7],
                " ...",
            )
            print("words with pos tags:", childes.tagged_words(file)[:7], " ...")
            print("words (only MOT):", childes.words(file, speaker="MOT")[:7], "...")
            print("words (only CHI):", childes.words(file, speaker="CHI")[:7], "...")
            print("stemmed words:", childes.words(file, stem=True)[:7], " ...")
            print(
                "words with relations and pos-tag:",
                childes.words(file, relation=True)[:5],
                " ...",
            )
            print("sentence:", childes.sents(file)[:2], " ...")
            for participant, values in childes.participants(file)[0].items():
                for key, value in values.items():
                    print("\tparticipant", participant, key, ":", value)
            print("num of sent:", len(childes.sents(file)))
            print("num of morphemes:", len(childes.words(file, stem=True)))
            print("age:", childes.age(file))
            print("age in month:", childes.age(file, month=True))
            print("MLU:", childes.MLU(file))
            print()

    except LookupError:
        print(
            """The CHILDES corpus, or the parts you need, should be manually
        downloaded from https://childes.talkbank.org/data-xml/ and saved at
        [NLTK_Data_Dir]/corpora/childes/
            Alternately, you can call the demo with the path to a portion of the CHILDES corpus, e.g.:
        demo('/path/to/childes/data-xml/Eng-USA/")
        """
        )
        # corpus_root_http = urllib2.urlopen('https://childes.talkbank.org/data-xml/Eng-USA/Bates.zip')
        # corpus_root_http_bates = zipfile.ZipFile(cStringIO.StringIO(corpus_root_http.read()))
        ##this fails
        # childes = CHILDESCorpusReader(corpus_root_http_bates,corpus_root_http_bates.namelist())
# Run the demonstration when this module is executed as a script.
if __name__ == "__main__":
    demo()
|