childes.py 25 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631
  1. # CHILDES XML Corpus Reader
  2. # Copyright (C) 2001-2020 NLTK Project
  3. # Author: Tomonori Nagano <tnagano@gc.cuny.edu>
  4. # Alexis Dimitriadis <A.Dimitriadis@uu.nl>
  5. # URL: <http://nltk.org/>
  6. # For license information, see LICENSE.TXT
  7. """
  8. Corpus reader for the XML version of the CHILDES corpus.
  9. """
  10. __docformat__ = "epytext en"
  11. import re
  12. from collections import defaultdict
  13. from nltk.util import flatten, LazyMap, LazyConcatenation
  14. from nltk.corpus.reader.util import concat
  15. from nltk.corpus.reader.xmldocs import XMLCorpusReader, ElementTree
  16. # to resolve the namespace issue
  17. NS = "http://www.talkbank.org/ns/talkbank"
class CHILDESCorpusReader(XMLCorpusReader):
    """
    Corpus reader for the XML version of the CHILDES corpus.

    The CHILDES corpus is available at ``https://childes.talkbank.org/``. The XML
    version of CHILDES is located at ``https://childes.talkbank.org/data-xml/``.
    Copy the needed parts of the CHILDES XML corpus into the NLTK data directory
    (``nltk_data/corpora/CHILDES/``).

    For access to the file text use the usual nltk functions,
    ``words()``, ``sents()``, ``tagged_words()`` and ``tagged_sents()``.
    """

    def __init__(self, root, fileids, lazy=True):
        # root: path to the corpus root directory
        # fileids: a regexp pattern or list naming the XML files to read
        # lazy: when True, the accessor methods return lazy sequences
        # (LazyMap / LazyConcatenation) instead of materialized lists
        XMLCorpusReader.__init__(self, root, fileids)
        self._lazy = lazy
  31. def words(
  32. self,
  33. fileids=None,
  34. speaker="ALL",
  35. stem=False,
  36. relation=False,
  37. strip_space=True,
  38. replace=False,
  39. ):
  40. """
  41. :return: the given file(s) as a list of words
  42. :rtype: list(str)
  43. :param speaker: If specified, select specific speaker(s) defined
  44. in the corpus. Default is 'ALL' (all participants). Common choices
  45. are 'CHI' (the child), 'MOT' (mother), ['CHI','MOT'] (exclude
  46. researchers)
  47. :param stem: If true, then use word stems instead of word strings.
  48. :param relation: If true, then return tuples of (stem, index,
  49. dependent_index)
  50. :param strip_space: If true, then strip trailing spaces from word
  51. tokens. Otherwise, leave the spaces on the tokens.
  52. :param replace: If true, then use the replaced (intended) word instead
  53. of the original word (e.g., 'wat' will be replaced with 'watch')
  54. """
  55. sent = None
  56. pos = False
  57. if not self._lazy:
  58. return [
  59. self._get_words(
  60. fileid, speaker, sent, stem, relation, pos, strip_space, replace
  61. )
  62. for fileid in self.abspaths(fileids)
  63. ]
  64. get_words = lambda fileid: self._get_words(
  65. fileid, speaker, sent, stem, relation, pos, strip_space, replace
  66. )
  67. return LazyConcatenation(LazyMap(get_words, self.abspaths(fileids)))
  68. def tagged_words(
  69. self,
  70. fileids=None,
  71. speaker="ALL",
  72. stem=False,
  73. relation=False,
  74. strip_space=True,
  75. replace=False,
  76. ):
  77. """
  78. :return: the given file(s) as a list of tagged
  79. words and punctuation symbols, encoded as tuples
  80. ``(word,tag)``.
  81. :rtype: list(tuple(str,str))
  82. :param speaker: If specified, select specific speaker(s) defined
  83. in the corpus. Default is 'ALL' (all participants). Common choices
  84. are 'CHI' (the child), 'MOT' (mother), ['CHI','MOT'] (exclude
  85. researchers)
  86. :param stem: If true, then use word stems instead of word strings.
  87. :param relation: If true, then return tuples of (stem, index,
  88. dependent_index)
  89. :param strip_space: If true, then strip trailing spaces from word
  90. tokens. Otherwise, leave the spaces on the tokens.
  91. :param replace: If true, then use the replaced (intended) word instead
  92. of the original word (e.g., 'wat' will be replaced with 'watch')
  93. """
  94. sent = None
  95. pos = True
  96. if not self._lazy:
  97. return [
  98. self._get_words(
  99. fileid, speaker, sent, stem, relation, pos, strip_space, replace
  100. )
  101. for fileid in self.abspaths(fileids)
  102. ]
  103. get_words = lambda fileid: self._get_words(
  104. fileid, speaker, sent, stem, relation, pos, strip_space, replace
  105. )
  106. return LazyConcatenation(LazyMap(get_words, self.abspaths(fileids)))
  107. def sents(
  108. self,
  109. fileids=None,
  110. speaker="ALL",
  111. stem=False,
  112. relation=None,
  113. strip_space=True,
  114. replace=False,
  115. ):
  116. """
  117. :return: the given file(s) as a list of sentences or utterances, each
  118. encoded as a list of word strings.
  119. :rtype: list(list(str))
  120. :param speaker: If specified, select specific speaker(s) defined
  121. in the corpus. Default is 'ALL' (all participants). Common choices
  122. are 'CHI' (the child), 'MOT' (mother), ['CHI','MOT'] (exclude
  123. researchers)
  124. :param stem: If true, then use word stems instead of word strings.
  125. :param relation: If true, then return tuples of ``(str,pos,relation_list)``.
  126. If there is manually-annotated relation info, it will return
  127. tuples of ``(str,pos,test_relation_list,str,pos,gold_relation_list)``
  128. :param strip_space: If true, then strip trailing spaces from word
  129. tokens. Otherwise, leave the spaces on the tokens.
  130. :param replace: If true, then use the replaced (intended) word instead
  131. of the original word (e.g., 'wat' will be replaced with 'watch')
  132. """
  133. sent = True
  134. pos = False
  135. if not self._lazy:
  136. return [
  137. self._get_words(
  138. fileid, speaker, sent, stem, relation, pos, strip_space, replace
  139. )
  140. for fileid in self.abspaths(fileids)
  141. ]
  142. get_words = lambda fileid: self._get_words(
  143. fileid, speaker, sent, stem, relation, pos, strip_space, replace
  144. )
  145. return LazyConcatenation(LazyMap(get_words, self.abspaths(fileids)))
  146. def tagged_sents(
  147. self,
  148. fileids=None,
  149. speaker="ALL",
  150. stem=False,
  151. relation=None,
  152. strip_space=True,
  153. replace=False,
  154. ):
  155. """
  156. :return: the given file(s) as a list of
  157. sentences, each encoded as a list of ``(word,tag)`` tuples.
  158. :rtype: list(list(tuple(str,str)))
  159. :param speaker: If specified, select specific speaker(s) defined
  160. in the corpus. Default is 'ALL' (all participants). Common choices
  161. are 'CHI' (the child), 'MOT' (mother), ['CHI','MOT'] (exclude
  162. researchers)
  163. :param stem: If true, then use word stems instead of word strings.
  164. :param relation: If true, then return tuples of ``(str,pos,relation_list)``.
  165. If there is manually-annotated relation info, it will return
  166. tuples of ``(str,pos,test_relation_list,str,pos,gold_relation_list)``
  167. :param strip_space: If true, then strip trailing spaces from word
  168. tokens. Otherwise, leave the spaces on the tokens.
  169. :param replace: If true, then use the replaced (intended) word instead
  170. of the original word (e.g., 'wat' will be replaced with 'watch')
  171. """
  172. sent = True
  173. pos = True
  174. if not self._lazy:
  175. return [
  176. self._get_words(
  177. fileid, speaker, sent, stem, relation, pos, strip_space, replace
  178. )
  179. for fileid in self.abspaths(fileids)
  180. ]
  181. get_words = lambda fileid: self._get_words(
  182. fileid, speaker, sent, stem, relation, pos, strip_space, replace
  183. )
  184. return LazyConcatenation(LazyMap(get_words, self.abspaths(fileids)))
  185. def corpus(self, fileids=None):
  186. """
  187. :return: the given file(s) as a dict of ``(corpus_property_key, value)``
  188. :rtype: list(dict)
  189. """
  190. if not self._lazy:
  191. return [self._get_corpus(fileid) for fileid in self.abspaths(fileids)]
  192. return LazyMap(self._get_corpus, self.abspaths(fileids))
  193. def _get_corpus(self, fileid):
  194. results = dict()
  195. xmldoc = ElementTree.parse(fileid).getroot()
  196. for key, value in xmldoc.items():
  197. results[key] = value
  198. return results
  199. def participants(self, fileids=None):
  200. """
  201. :return: the given file(s) as a dict of
  202. ``(participant_property_key, value)``
  203. :rtype: list(dict)
  204. """
  205. if not self._lazy:
  206. return [self._get_participants(fileid) for fileid in self.abspaths(fileids)]
  207. return LazyMap(self._get_participants, self.abspaths(fileids))
  208. def _get_participants(self, fileid):
  209. # multidimensional dicts
  210. def dictOfDicts():
  211. return defaultdict(dictOfDicts)
  212. xmldoc = ElementTree.parse(fileid).getroot()
  213. # getting participants' data
  214. pat = dictOfDicts()
  215. for participant in xmldoc.findall(
  216. ".//{%s}Participants/{%s}participant" % (NS, NS)
  217. ):
  218. for (key, value) in participant.items():
  219. pat[participant.get("id")][key] = value
  220. return pat
  221. def age(self, fileids=None, speaker="CHI", month=False):
  222. """
  223. :return: the given file(s) as string or int
  224. :rtype: list or int
  225. :param month: If true, return months instead of year-month-date
  226. """
  227. if not self._lazy:
  228. return [
  229. self._get_age(fileid, speaker, month)
  230. for fileid in self.abspaths(fileids)
  231. ]
  232. get_age = lambda fileid: self._get_age(fileid, speaker, month)
  233. return LazyMap(get_age, self.abspaths(fileids))
  234. def _get_age(self, fileid, speaker, month):
  235. xmldoc = ElementTree.parse(fileid).getroot()
  236. for pat in xmldoc.findall(".//{%s}Participants/{%s}participant" % (NS, NS)):
  237. try:
  238. if pat.get("id") == speaker:
  239. age = pat.get("age")
  240. if month:
  241. age = self.convert_age(age)
  242. return age
  243. # some files don't have age data
  244. except (TypeError, AttributeError) as e:
  245. return None
  246. def convert_age(self, age_year):
  247. "Caclculate age in months from a string in CHILDES format"
  248. m = re.match("P(\d+)Y(\d+)M?(\d?\d?)D?", age_year)
  249. age_month = int(m.group(1)) * 12 + int(m.group(2))
  250. try:
  251. if int(m.group(3)) > 15:
  252. age_month += 1
  253. # some corpora don't have age information?
  254. except ValueError as e:
  255. pass
  256. return age_month
  257. def MLU(self, fileids=None, speaker="CHI"):
  258. """
  259. :return: the given file(s) as a floating number
  260. :rtype: list(float)
  261. """
  262. if not self._lazy:
  263. return [
  264. self._getMLU(fileid, speaker=speaker)
  265. for fileid in self.abspaths(fileids)
  266. ]
  267. get_MLU = lambda fileid: self._getMLU(fileid, speaker=speaker)
  268. return LazyMap(get_MLU, self.abspaths(fileids))
  269. def _getMLU(self, fileid, speaker):
  270. sents = self._get_words(
  271. fileid,
  272. speaker=speaker,
  273. sent=True,
  274. stem=True,
  275. relation=False,
  276. pos=True,
  277. strip_space=True,
  278. replace=True,
  279. )
  280. results = []
  281. lastSent = []
  282. numFillers = 0
  283. sentDiscount = 0
  284. for sent in sents:
  285. posList = [pos for (word, pos) in sent]
  286. # if any part of the sentence is intelligible
  287. if any(pos == "unk" for pos in posList):
  288. continue
  289. # if the sentence is null
  290. elif sent == []:
  291. continue
  292. # if the sentence is the same as the last sent
  293. elif sent == lastSent:
  294. continue
  295. else:
  296. results.append([word for (word, pos) in sent])
  297. # count number of fillers
  298. if len(set(["co", None]).intersection(posList)) > 0:
  299. numFillers += posList.count("co")
  300. numFillers += posList.count(None)
  301. sentDiscount += 1
  302. lastSent = sent
  303. try:
  304. thisWordList = flatten(results)
  305. # count number of morphemes
  306. # (e.g., 'read' = 1 morpheme but 'read-PAST' is 2 morphemes)
  307. numWords = (
  308. len(flatten([word.split("-") for word in thisWordList])) - numFillers
  309. )
  310. numSents = len(results) - sentDiscount
  311. mlu = numWords / numSents
  312. except ZeroDivisionError:
  313. mlu = 0
  314. # return {'mlu':mlu,'wordNum':numWords,'sentNum':numSents}
  315. return mlu
    def _get_words(
        self, fileid, speaker, sent, stem, relation, pos, strip_space, replace
    ):
        """
        Parse one CHILDES XML file and return its words or utterances.

        :param fileid: path of the XML file to parse
        :param speaker: 'ALL', or a speaker code / list of codes to keep
        :param sent: if true, group words into per-utterance lists
        :param stem: if true, use the annotated stem instead of the
            surface form
        :param relation: if true, attach dependency-relation info
        :param pos: if true, return (word, tag) tuples
        :param strip_space: if true, strip whitespace around tokens
        :param replace: if true, substitute annotated replacement words
        """
        if (
            isinstance(speaker, str) and speaker != "ALL"
        ):  # ensure we have a list of speakers
            speaker = [speaker]
        xmldoc = ElementTree.parse(fileid).getroot()
        # processing each xml doc
        results = []
        # each <u> element is one utterance
        for xmlsent in xmldoc.findall(".//{%s}u" % NS):
            sents = []
            # select speakers: the <u who="..."> attribute names the speaker
            if speaker == "ALL" or xmlsent.get("who") in speaker:
                for xmlword in xmlsent.findall(".//{%s}w" % NS):
                    infl = None  # NOTE(review): assigned but never used
                    suffixStem = None
                    suffixTag = None
                    # getting replaced words
                    # NOTE(review): ElementTree Elements without children
                    # are falsy, so this truth test may skip an existing
                    # (childless) node; the documented idiom is
                    # "is not None" -- confirm before changing.
                    if replace and xmlsent.find(".//{%s}w/{%s}replacement" % (NS, NS)):
                        xmlword = xmlsent.find(
                            ".//{%s}w/{%s}replacement/{%s}w" % (NS, NS, NS)
                        )
                    elif replace and xmlsent.find(".//{%s}w/{%s}wk" % (NS, NS)):
                        xmlword = xmlsent.find(".//{%s}w/{%s}wk" % (NS, NS))
                    # get text
                    if xmlword.text:
                        word = xmlword.text
                    else:
                        word = ""
                    # strip tailing space
                    if strip_space:
                        word = word.strip()
                    # stem: replace the surface form with the <stem> text
                    if relation or stem:
                        try:
                            xmlstem = xmlword.find(".//{%s}stem" % NS)
                            word = xmlstem.text
                        # no <stem> child: find() returned None
                        except AttributeError as e:
                            pass
                        # if there is an inflection, append it as "-INFL"
                        try:
                            xmlinfl = xmlword.find(
                                ".//{%s}mor/{%s}mw/{%s}mk" % (NS, NS, NS)
                            )
                            word += "-" + xmlinfl.text
                        # NOTE(review): bare except also hides unrelated
                        # errors, not only a missing element
                        except:
                            pass
                        # if there is a suffix (clitic), append it as "~SUFFIX"
                        try:
                            xmlsuffix = xmlword.find(
                                ".//{%s}mor/{%s}mor-post/{%s}mw/{%s}stem"
                                % (NS, NS, NS, NS)
                            )
                            suffixStem = xmlsuffix.text
                        except AttributeError:
                            suffixStem = ""
                        if suffixStem:
                            word += "~" + suffixStem
                    # pos: build the "category[:subcategory]" tag string
                    if relation or pos:
                        try:
                            xmlpos = xmlword.findall(".//{%s}c" % NS)
                            xmlpos2 = xmlword.findall(".//{%s}s" % NS)
                            if xmlpos2 != []:
                                tag = xmlpos[0].text + ":" + xmlpos2[0].text
                            else:
                                tag = xmlpos[0].text
                        except (AttributeError, IndexError) as e:
                            tag = ""
                        # the clitic's own tag, appended as "~TAG"
                        try:
                            xmlsuffixpos = xmlword.findall(
                                ".//{%s}mor/{%s}mor-post/{%s}mw/{%s}pos/{%s}c"
                                % (NS, NS, NS, NS, NS)
                            )
                            xmlsuffixpos2 = xmlword.findall(
                                ".//{%s}mor/{%s}mor-post/{%s}mw/{%s}pos/{%s}s"
                                % (NS, NS, NS, NS, NS)
                            )
                            if xmlsuffixpos2:
                                suffixTag = (
                                    xmlsuffixpos[0].text + ":" + xmlsuffixpos2[0].text
                                )
                            else:
                                suffixTag = xmlsuffixpos[0].text
                        except:
                            pass
                        if suffixTag:
                            tag += "~" + suffixTag
                        word = (word, tag)
                    # relational
                    # the gold standard is stored in
                    # <mor></mor><mor type="trn"><gra type="grt">
                    if relation == True:
                        for xmlstem_rel in xmlword.findall(
                            ".//{%s}mor/{%s}gra" % (NS, NS)
                        ):
                            # automatic annotation: extend to a 3-tuple
                            if not xmlstem_rel.get("type") == "grt":
                                word = (
                                    word[0],
                                    word[1],
                                    xmlstem_rel.get("index")
                                    + "|"
                                    + xmlstem_rel.get("head")
                                    + "|"
                                    + xmlstem_rel.get("relation"),
                                )
                            # gold annotation present: extend to a 6-tuple
                            else:
                                word = (
                                    word[0],
                                    word[1],
                                    word[2],
                                    word[0],
                                    word[1],
                                    xmlstem_rel.get("index")
                                    + "|"
                                    + xmlstem_rel.get("head")
                                    + "|"
                                    + xmlstem_rel.get("relation"),
                                )
                        # same treatment for the clitic's relation info
                        try:
                            for xmlpost_rel in xmlword.findall(
                                ".//{%s}mor/{%s}mor-post/{%s}gra" % (NS, NS, NS)
                            ):
                                if not xmlpost_rel.get("type") == "grt":
                                    suffixStem = (
                                        suffixStem[0],
                                        suffixStem[1],
                                        xmlpost_rel.get("index")
                                        + "|"
                                        + xmlpost_rel.get("head")
                                        + "|"
                                        + xmlpost_rel.get("relation"),
                                    )
                                else:
                                    suffixStem = (
                                        suffixStem[0],
                                        suffixStem[1],
                                        suffixStem[2],
                                        suffixStem[0],
                                        suffixStem[1],
                                        xmlpost_rel.get("index")
                                        + "|"
                                        + xmlpost_rel.get("head")
                                        + "|"
                                        + xmlpost_rel.get("relation"),
                                    )
                        # NOTE(review): bare except; suffixStem may be a
                        # plain string here, so indexing can raise
                        except:
                            pass
                    sents.append(word)
                # keep utterances as lists when sentence/relation output
                # was requested; otherwise flatten into one word stream
                if sent or relation:
                    results.append(sents)
                else:
                    results.extend(sents)
        # identity LazyMap keeps the return type lazy-sequence-like
        return LazyMap(lambda x: x, results)
    # Ready-to-use browser opener
    """
    The base URL for viewing files on the childes website. This
    shouldn't need to be changed, unless CHILDES changes the configuration
    of their server or unless the user sets up their own corpus webserver.
    """
    # NOTE: the string above is a bare expression attached to no name; it
    # serves purely as in-source documentation for the attribute below.
    childes_url_base = r"https://childes.talkbank.org/browser/index.php?url="
  478. def webview_file(self, fileid, urlbase=None):
  479. """Map a corpus file to its web version on the CHILDES website,
  480. and open it in a web browser.
  481. The complete URL to be used is:
  482. childes.childes_url_base + urlbase + fileid.replace('.xml', '.cha')
  483. If no urlbase is passed, we try to calculate it. This
  484. requires that the childes corpus was set up to mirror the
  485. folder hierarchy under childes.psy.cmu.edu/data-xml/, e.g.:
  486. nltk_data/corpora/childes/Eng-USA/Cornell/??? or
  487. nltk_data/corpora/childes/Romance/Spanish/Aguirre/???
  488. The function first looks (as a special case) if "Eng-USA" is
  489. on the path consisting of <corpus root>+fileid; then if
  490. "childes", possibly followed by "data-xml", appears. If neither
  491. one is found, we use the unmodified fileid and hope for the best.
  492. If this is not right, specify urlbase explicitly, e.g., if the
  493. corpus root points to the Cornell folder, urlbase='Eng-USA/Cornell'.
  494. """
  495. import webbrowser
  496. if urlbase:
  497. path = urlbase + "/" + fileid
  498. else:
  499. full = self.root + "/" + fileid
  500. full = re.sub(r"\\", "/", full)
  501. if "/childes/" in full.lower():
  502. # Discard /data-xml/ if present
  503. path = re.findall(r"(?i)/childes(?:/data-xml)?/(.*)\.xml", full)[0]
  504. elif "eng-usa" in full.lower():
  505. path = "Eng-USA/" + re.findall(r"/(?i)Eng-USA/(.*)\.xml", full)[0]
  506. else:
  507. path = fileid
  508. # Strip ".xml" and add ".cha", as necessary:
  509. if path.endswith(".xml"):
  510. path = path[:-4]
  511. if not path.endswith(".cha"):
  512. path = path + ".cha"
  513. url = self.childes_url_base + path
  514. webbrowser.open_new_tab(url)
  515. print("Opening in browser:", url)
  516. # Pausing is a good idea, but it's up to the user...
  517. # raw_input("Hit Return to continue")
def demo(corpus_root=None):
    """
    The CHILDES corpus should be manually downloaded and saved
    to ``[NLTK_Data_Dir]/corpora/childes/``
    """
    if not corpus_root:
        # fall back to the Eng-USA portion inside the NLTK data directory
        from nltk.data import find

        corpus_root = find("corpora/childes/data-xml/Eng-USA/")
    try:
        childes = CHILDESCorpusReader(corpus_root, ".*.xml")
        # describe all corpus: exercise each accessor on the first files
        for file in childes.fileids()[:5]:
            corpus = ""
            corpus_id = ""
            # pull the Corpus and Id attributes out of the file metadata
            for (key, value) in childes.corpus(file)[0].items():
                if key == "Corpus":
                    corpus = value
                if key == "Id":
                    corpus_id = value
            print("Reading", corpus, corpus_id, " .....")
            print("words:", childes.words(file)[:7], "...")
            print(
                "words with replaced words:",
                childes.words(file, replace=True)[:7],
                " ...",
            )
            print("words with pos tags:", childes.tagged_words(file)[:7], " ...")
            print("words (only MOT):", childes.words(file, speaker="MOT")[:7], "...")
            print("words (only CHI):", childes.words(file, speaker="CHI")[:7], "...")
            print("stemmed words:", childes.words(file, stem=True)[:7], " ...")
            print(
                "words with relations and pos-tag:",
                childes.words(file, relation=True)[:5],
                " ...",
            )
            print("sentence:", childes.sents(file)[:2], " ...")
            # dump every recorded attribute for every participant
            for (participant, values) in childes.participants(file)[0].items():
                for (key, value) in values.items():
                    print("\tparticipant", participant, key, ":", value)
            print("num of sent:", len(childes.sents(file)))
            print("num of morphemes:", len(childes.words(file, stem=True)))
            print("age:", childes.age(file))
            print("age in month:", childes.age(file, month=True))
            print("MLU:", childes.MLU(file))
            print()
    # raised by nltk.data.find / the reader when the corpus is missing
    except LookupError as e:
        print(
            """The CHILDES corpus, or the parts you need, should be manually
downloaded from https://childes.talkbank.org/data-xml/ and saved at
[NLTK_Data_Dir]/corpora/childes/
Alternately, you can call the demo with the path to a portion of the CHILDES corpus, e.g.:
demo('/path/to/childes/data-xml/Eng-USA/")
"""
        )
  572. # corpus_root_http = urllib2.urlopen('https://childes.talkbank.org/data-xml/Eng-USA/Bates.zip')
  573. # corpus_root_http_bates = zipfile.ZipFile(cStringIO.StringIO(corpus_root_http.read()))
  574. ##this fails
  575. # childes = CHILDESCorpusReader(corpus_root_http_bates,corpus_root_http_bates.namelist())
# Run the demonstration when this module is executed as a script.
if __name__ == "__main__":
    demo()