lexicon.py 9.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339
  1. # Natural Language Toolkit: Combinatory Categorial Grammar
  2. #
  3. # Copyright (C) 2001-2020 NLTK Project
  4. # Author: Graeme Gange <ggange@csse.unimelb.edu.au>
  5. # URL: <http://nltk.org/>
  6. # For license information, see LICENSE.TXT
  7. """
  8. CCG Lexicons
  9. """
  10. import re
  11. from collections import defaultdict
  12. from nltk.ccg.api import PrimitiveCategory, Direction, CCGVar, FunctionalCategory
  13. from nltk.internals import deprecated
  14. from nltk.sem.logic import Expression
  15. # ------------
  16. # Regular expressions used for parsing components of the lexicon
  17. # ------------
  18. # Parses a primitive category and subscripts
  19. PRIM_RE = re.compile(r"""([A-Za-z]+)(\[[A-Za-z,]+\])?""")
  20. # Separates the next primitive category from the remainder of the
  21. # string
  22. NEXTPRIM_RE = re.compile(r"""([A-Za-z]+(?:\[[A-Za-z,]+\])?)(.*)""")
  23. # Separates the next application operator from the remainder
  24. APP_RE = re.compile(r"""([\\/])([.,]?)([.,]?)(.*)""")
  25. # Parses the definition of the right-hand side (rhs) of either a word or a family
  26. LEX_RE = re.compile(r"""([\S_]+)\s*(::|[-=]+>)\s*(.+)""", re.UNICODE)
  27. # Parses the right hand side that contains category and maybe semantic predicate
  28. RHS_RE = re.compile(r"""([^{}]*[^ {}])\s*(\{[^}]+\})?""", re.UNICODE)
  29. # Parses the semantic predicate
  30. SEMANTICS_RE = re.compile(r"""\{([^}]+)\}""", re.UNICODE)
  31. # Strips comments from a line
  32. COMMENTS_RE = re.compile("""([^#]*)(?:#.*)?""")
  33. class Token(object):
  34. """
  35. Class representing a token.
  36. token => category {semantics}
  37. e.g. eat => S\\var[pl]/var {\\x y.eat(x,y)}
  38. * `token` (string)
  39. * `categ` (string)
  40. * `semantics` (Expression)
  41. """
  42. def __init__(self, token, categ, semantics=None):
  43. self._token = token
  44. self._categ = categ
  45. self._semantics = semantics
  46. def categ(self):
  47. return self._categ
  48. def semantics(self):
  49. return self._semantics
  50. def __str__(self):
  51. semantics_str = ""
  52. if self._semantics is not None:
  53. semantics_str = " {" + str(self._semantics) + "}"
  54. return "" + str(self._categ) + semantics_str
  55. def __cmp__(self, other):
  56. if not isinstance(other, Token):
  57. return -1
  58. return cmp((self._categ, self._semantics), other.categ(), other.semantics())
  59. class CCGLexicon(object):
  60. """
  61. Class representing a lexicon for CCG grammars.
  62. * `primitives`: The list of primitive categories for the lexicon
  63. * `families`: Families of categories
  64. * `entries`: A mapping of words to possible categories
  65. """
  66. def __init__(self, start, primitives, families, entries):
  67. self._start = PrimitiveCategory(start)
  68. self._primitives = primitives
  69. self._families = families
  70. self._entries = entries
  71. def categories(self, word):
  72. """
  73. Returns all the possible categories for a word
  74. """
  75. return self._entries[word]
  76. def start(self):
  77. """
  78. Return the target category for the parser
  79. """
  80. return self._start
  81. def __str__(self):
  82. """
  83. String representation of the lexicon. Used for debugging.
  84. """
  85. string = ""
  86. first = True
  87. for ident in sorted(self._entries):
  88. if not first:
  89. string = string + "\n"
  90. string = string + ident + " => "
  91. first = True
  92. for cat in self._entries[ident]:
  93. if not first:
  94. string = string + " | "
  95. else:
  96. first = False
  97. string = string + "%s" % cat
  98. return string
  99. # -----------
  100. # Parsing lexicons
  101. # -----------
  102. def matchBrackets(string):
  103. """
  104. Separate the contents matching the first set of brackets from the rest of
  105. the input.
  106. """
  107. rest = string[1:]
  108. inside = "("
  109. while rest != "" and not rest.startswith(")"):
  110. if rest.startswith("("):
  111. (part, rest) = matchBrackets(rest)
  112. inside = inside + part
  113. else:
  114. inside = inside + rest[0]
  115. rest = rest[1:]
  116. if rest.startswith(")"):
  117. return (inside + ")", rest[1:])
  118. raise AssertionError("Unmatched bracket in string '" + string + "'")
  119. def nextCategory(string):
  120. """
  121. Separate the string for the next portion of the category from the rest
  122. of the string
  123. """
  124. if string.startswith("("):
  125. return matchBrackets(string)
  126. return NEXTPRIM_RE.match(string).groups()
  127. def parseApplication(app):
  128. """
  129. Parse an application operator
  130. """
  131. return Direction(app[0], app[1:])
  132. def parseSubscripts(subscr):
  133. """
  134. Parse the subscripts for a primitive category
  135. """
  136. if subscr:
  137. return subscr[1:-1].split(",")
  138. return []
  139. def parsePrimitiveCategory(chunks, primitives, families, var):
  140. """
  141. Parse a primitive category
  142. If the primitive is the special category 'var', replace it with the
  143. correct `CCGVar`.
  144. """
  145. if chunks[0] == "var":
  146. if chunks[1] is None:
  147. if var is None:
  148. var = CCGVar()
  149. return (var, var)
  150. catstr = chunks[0]
  151. if catstr in families:
  152. (cat, cvar) = families[catstr]
  153. if var is None:
  154. var = cvar
  155. else:
  156. cat = cat.substitute([(cvar, var)])
  157. return (cat, var)
  158. if catstr in primitives:
  159. subscrs = parseSubscripts(chunks[1])
  160. return (PrimitiveCategory(catstr, subscrs), var)
  161. raise AssertionError(
  162. "String '" + catstr + "' is neither a family nor primitive category."
  163. )
  164. def augParseCategory(line, primitives, families, var=None):
  165. """
  166. Parse a string representing a category, and returns a tuple with
  167. (possibly) the CCG variable for the category
  168. """
  169. (cat_string, rest) = nextCategory(line)
  170. if cat_string.startswith("("):
  171. (res, var) = augParseCategory(cat_string[1:-1], primitives, families, var)
  172. else:
  173. (res, var) = parsePrimitiveCategory(
  174. PRIM_RE.match(cat_string).groups(), primitives, families, var
  175. )
  176. while rest != "":
  177. app = APP_RE.match(rest).groups()
  178. direction = parseApplication(app[0:3])
  179. rest = app[3]
  180. (cat_string, rest) = nextCategory(rest)
  181. if cat_string.startswith("("):
  182. (arg, var) = augParseCategory(cat_string[1:-1], primitives, families, var)
  183. else:
  184. (arg, var) = parsePrimitiveCategory(
  185. PRIM_RE.match(cat_string).groups(), primitives, families, var
  186. )
  187. res = FunctionalCategory(res, arg, direction)
  188. return (res, var)
  189. def fromstring(lex_str, include_semantics=False):
  190. """
  191. Convert string representation into a lexicon for CCGs.
  192. """
  193. CCGVar.reset_id()
  194. primitives = []
  195. families = {}
  196. entries = defaultdict(list)
  197. for line in lex_str.splitlines():
  198. # Strip comments and leading/trailing whitespace.
  199. line = COMMENTS_RE.match(line).groups()[0].strip()
  200. if line == "":
  201. continue
  202. if line.startswith(":-"):
  203. # A line of primitive categories.
  204. # The first one is the target category
  205. # ie, :- S, N, NP, VP
  206. primitives = primitives + [
  207. prim.strip() for prim in line[2:].strip().split(",")
  208. ]
  209. else:
  210. # Either a family definition, or a word definition
  211. (ident, sep, rhs) = LEX_RE.match(line).groups()
  212. (catstr, semantics_str) = RHS_RE.match(rhs).groups()
  213. (cat, var) = augParseCategory(catstr, primitives, families)
  214. if sep == "::":
  215. # Family definition
  216. # ie, Det :: NP/N
  217. families[ident] = (cat, var)
  218. else:
  219. semantics = None
  220. if include_semantics is True:
  221. if semantics_str is None:
  222. raise AssertionError(
  223. line
  224. + " must contain semantics because include_semantics is set to True"
  225. )
  226. else:
  227. semantics = Expression.fromstring(
  228. SEMANTICS_RE.match(semantics_str).groups()[0]
  229. )
  230. # Word definition
  231. # ie, which => (N\N)/(S/NP)
  232. entries[ident].append(Token(ident, cat, semantics))
  233. return CCGLexicon(primitives[0], primitives, families, entries)
  234. @deprecated("Use fromstring() instead.")
  235. def parseLexicon(lex_str):
  236. return fromstring(lex_str)
# Small ready-made lexicon, built at import time by passing the text below
# to fromstring(). The text declares the primitives S, NP and N, a handful
# of category families, and singular/plural entries for a few words.
openccg_tinytiny = fromstring(
    """
# Rather minimal lexicon based on the openccg `tinytiny' grammar.
# Only incorporates a subset of the morphological subcategories, however.
:- S,NP,N # Primitive categories
Det :: NP/N # Determiners
Pro :: NP
IntransVsg :: S\\NP[sg] # Tensed intransitive verbs (singular)
IntransVpl :: S\\NP[pl] # Plural
TransVsg :: S\\NP[sg]/NP # Tensed transitive verbs (singular)
TransVpl :: S\\NP[pl]/NP # Plural
the => NP[sg]/N[sg]
the => NP[pl]/N[pl]
I => Pro
me => Pro
we => Pro
us => Pro
book => N[sg]
books => N[pl]
peach => N[sg]
peaches => N[pl]
policeman => N[sg]
policemen => N[pl]
boy => N[sg]
boys => N[pl]
sleep => IntransVsg
sleep => IntransVpl
eat => IntransVpl
eat => TransVpl
eats => IntransVsg
eats => TransVsg
see => TransVpl
sees => TransVsg
"""
)