arlstem.py 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362
# -*- coding: utf-8 -*-
#
# Natural Language Toolkit: ARLSTem Stemmer
#
# Copyright (C) 2001-2020 NLTK Project
#
# Author: Kheireddine Abainia (x-programer) <k.abainia@gmail.com>
# Algorithms: Kheireddine Abainia <k.abainia@gmail.com>
#             Siham Ouamour
#             Halim Sayoud
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
"""
ARLSTem Arabic Stemmer

The details about the implementation of this algorithm are described in:
K. Abainia, S. Ouamour and H. Sayoud, A Novel Robust Arabic Light Stemmer,
Journal of Experimental & Theoretical Artificial Intelligence (JETAI'17),
Vol. 29, No. 3, 2017, pp. 557-573.

The ARLSTem is a light Arabic stemmer that is based on removing the affixes
from the word (i.e. prefixes, suffixes and infixes). It was evaluated and
compared to several other stemmers using Paice's parameters (under-stemming
index, over-stemming index and stemming weight), and the results showed that
ARLSTem is promising and achieves high performance. This stemmer is not
based on any dictionary and can be used on-line effectively.
"""
  26. import re
  27. from nltk.stem.api import StemmerI
  28. class ARLSTem(StemmerI):
  29. """
  30. ARLSTem stemmer : a light Arabic Stemming algorithm without any dictionary.
  31. Department of Telecommunication & Information Processing. USTHB University,
  32. Algiers, Algeria.
  33. ARLSTem.stem(token) returns the Arabic stem for the input token.
  34. The ARLSTem Stemmer requires that all tokens are encoded using Unicode
  35. encoding.
  36. """
  37. def __init__(self):
  38. # different Alif with hamza
  39. self.re_hamzated_alif = re.compile(r"[\u0622\u0623\u0625]")
  40. self.re_alifMaqsura = re.compile(r"[\u0649]")
  41. self.re_diacritics = re.compile(r"[\u064B-\u065F]")
  42. # Alif Laam, Laam Laam, Fa Laam, Fa Ba
  43. self.pr2 = ["\u0627\u0644", "\u0644\u0644", "\u0641\u0644", "\u0641\u0628"]
  44. # Ba Alif Laam, Kaaf Alif Laam, Waaw Alif Laam
  45. self.pr3 = ["\u0628\u0627\u0644", "\u0643\u0627\u0644", "\u0648\u0627\u0644"]
  46. # Fa Laam Laam, Waaw Laam Laam
  47. self.pr32 = ["\u0641\u0644\u0644", "\u0648\u0644\u0644"]
  48. # Fa Ba Alif Laam, Waaw Ba Alif Laam, Fa Kaaf Alif Laam
  49. self.pr4 = [
  50. "\u0641\u0628\u0627\u0644",
  51. "\u0648\u0628\u0627\u0644",
  52. "\u0641\u0643\u0627\u0644",
  53. ]
  54. # Kaf Yaa, Kaf Miim
  55. self.su2 = ["\u0643\u064A", "\u0643\u0645"]
  56. # Ha Alif, Ha Miim
  57. self.su22 = ["\u0647\u0627", "\u0647\u0645"]
  58. # Kaf Miim Alif, Kaf Noon Shadda
  59. self.su3 = ["\u0643\u0645\u0627", "\u0643\u0646\u0651"]
  60. # Ha Miim Alif, Ha Noon Shadda
  61. self.su32 = ["\u0647\u0645\u0627", "\u0647\u0646\u0651"]
  62. # Alif Noon, Ya Noon, Waaw Noon
  63. self.pl_si2 = ["\u0627\u0646", "\u064A\u0646", "\u0648\u0646"]
  64. # Taa Alif Noon, Taa Ya Noon
  65. self.pl_si3 = ["\u062A\u0627\u0646", "\u062A\u064A\u0646"]
  66. # Alif Noon, Waaw Noon
  67. self.verb_su2 = ["\u0627\u0646", "\u0648\u0646"]
  68. # Siin Taa, Siin Yaa
  69. self.verb_pr2 = ["\u0633\u062A", "\u0633\u064A"]
  70. # Siin Alif, Siin Noon
  71. self.verb_pr22 = ["\u0633\u0627", "\u0633\u0646"]
  72. # Lam Noon, Lam Taa, Lam Yaa, Lam Hamza
  73. self.verb_pr33 = [
  74. "\u0644\u0646",
  75. "\u0644\u062A",
  76. "\u0644\u064A",
  77. "\u0644\u0623",
  78. ]
  79. # Taa Miim Alif, Taa Noon Shadda
  80. self.verb_suf3 = ["\u062A\u0645\u0627", "\u062A\u0646\u0651"]
  81. # Noon Alif, Taa Miim, Taa Alif, Waaw Alif
  82. self.verb_suf2 = [
  83. "\u0646\u0627",
  84. "\u062A\u0645",
  85. "\u062A\u0627",
  86. "\u0648\u0627",
  87. ]
  88. # Taa, Alif, Noon
  89. self.verb_suf1 = ["\u062A", "\u0627", "\u0646"]
  90. def stem(self, token):
  91. """
  92. call this function to get the word's stem based on ARLSTem .
  93. """
  94. try:
  95. if token is None:
  96. raise ValueError(
  97. "The word could not be stemmed, because \
  98. it is empty !"
  99. )
  100. # remove Arabic diacritics and replace some letters with others
  101. token = self.norm(token)
  102. # strip common prefixes of the nouns
  103. pre = self.pref(token)
  104. if pre is not None:
  105. token = pre
  106. # strip the suffixes which are common to nouns and verbs
  107. token = self.suff(token)
  108. # transform a plural noun to a singular noun
  109. ps = self.plur2sing(token)
  110. if ps is None:
  111. # transform from the feminine form to the masculine form
  112. fm = self.fem2masc(token)
  113. if fm is not None:
  114. return fm
  115. else:
  116. if pre is None: # if the prefixes are not stripped
  117. # strip the verb prefixes and suffixes
  118. return self.verb(token)
  119. else:
  120. return ps
  121. return token
  122. except ValueError as e:
  123. print(e)
  124. def norm(self, token):
  125. """
  126. normalize the word by removing diacritics, replacing hamzated Alif
  127. with Alif replacing AlifMaqsura with Yaa and removing Waaw at the
  128. beginning.
  129. """
  130. # strip Arabic diacritics
  131. token = self.re_diacritics.sub("", token)
  132. # replace Hamzated Alif with Alif bare
  133. token = self.re_hamzated_alif.sub("\u0627", token)
  134. # replace alifMaqsura with Yaa
  135. token = self.re_alifMaqsura.sub("\u064A", token)
  136. # strip the Waaw from the word beginning if the remaining is 3 letters
  137. # at least
  138. if token.startswith("\u0648") and len(token) > 3:
  139. token = token[1:]
  140. return token
  141. def pref(self, token):
  142. """
  143. remove prefixes from the words' beginning.
  144. """
  145. if len(token) > 5:
  146. for p3 in self.pr3:
  147. if token.startswith(p3):
  148. return token[3:]
  149. if len(token) > 6:
  150. for p4 in self.pr4:
  151. if token.startswith(p4):
  152. return token[4:]
  153. if len(token) > 5:
  154. for p3 in self.pr32:
  155. if token.startswith(p3):
  156. return token[3:]
  157. if len(token) > 4:
  158. for p2 in self.pr2:
  159. if token.startswith(p2):
  160. return token[2:]
  161. def suff(self, token):
  162. """
  163. remove suffixes from the word's end.
  164. """
  165. if token.endswith("\u0643") and len(token) > 3:
  166. return token[:-1]
  167. if len(token) > 4:
  168. for s2 in self.su2:
  169. if token.endswith(s2):
  170. return token[:-2]
  171. if len(token) > 5:
  172. for s3 in self.su3:
  173. if token.endswith(s3):
  174. return token[:-3]
  175. if token.endswith("\u0647") and len(token) > 3:
  176. token = token[:-1]
  177. return token
  178. if len(token) > 4:
  179. for s2 in self.su22:
  180. if token.endswith(s2):
  181. return token[:-2]
  182. if len(token) > 5:
  183. for s3 in self.su32:
  184. if token.endswith(s3):
  185. return token[:-3]
  186. if token.endswith("\u0646\u0627") and len(token) > 4:
  187. return token[:-2]
  188. return token
  189. def fem2masc(self, token):
  190. """
  191. transform the word from the feminine form to the masculine form.
  192. """
  193. if token.endswith("\u0629") and len(token) > 3:
  194. return token[:-1]
  195. def plur2sing(self, token):
  196. """
  197. transform the word from the plural form to the singular form.
  198. """
  199. if len(token) > 4:
  200. for ps2 in self.pl_si2:
  201. if token.endswith(ps2):
  202. return token[:-2]
  203. if len(token) > 5:
  204. for ps3 in self.pl_si3:
  205. if token.endswith(ps3):
  206. return token[:-3]
  207. if len(token) > 3 and token.endswith("\u0627\u062A"):
  208. return token[:-2]
  209. if len(token) > 3 and token.startswith("\u0627") and token[2] == "\u0627":
  210. return token[:2] + token[3:]
  211. if len(token) > 4 and token.startswith("\u0627") and token[-2] == "\u0627":
  212. return token[1:-2] + token[-1]
  213. def verb(self, token):
  214. """
  215. stem the verb prefixes and suffixes or both
  216. """
  217. vb = self.verb_t1(token)
  218. if vb is not None:
  219. return vb
  220. vb = self.verb_t2(token)
  221. if vb is not None:
  222. return vb
  223. vb = self.verb_t3(token)
  224. if vb is not None:
  225. return vb
  226. vb = self.verb_t4(token)
  227. if vb is not None:
  228. return vb
  229. vb = self.verb_t5(token)
  230. if vb is not None:
  231. return vb
  232. return self.verb_t6(token)
  233. def verb_t1(self, token):
  234. """
  235. stem the present prefixes and suffixes
  236. """
  237. if len(token) > 5 and token.startswith("\u062A"): # Taa
  238. for s2 in self.pl_si2:
  239. if token.endswith(s2):
  240. return token[1:-2]
  241. if len(token) > 5 and token.startswith("\u064A"): # Yaa
  242. for s2 in self.verb_su2:
  243. if token.endswith(s2):
  244. return token[1:-2]
  245. if len(token) > 4 and token.startswith("\u0627"): # Alif
  246. # Waaw Alif
  247. if len(token) > 5 and token.endswith("\u0648\u0627"):
  248. return token[1:-2]
  249. # Yaa
  250. if token.endswith("\u064A"):
  251. return token[1:-1]
  252. # Alif
  253. if token.endswith("\u0627"):
  254. return token[1:-1]
  255. # Noon
  256. if token.endswith("\u0646"):
  257. return token[1:-1]
  258. # ^Yaa, Noon$
  259. if len(token) > 4 and token.startswith("\u064A") and token.endswith("\u0646"):
  260. return token[1:-1]
  261. # ^Taa, Noon$
  262. if len(token) > 4 and token.startswith("\u062A") and token.endswith("\u0646"):
  263. return token[1:-1]
  264. def verb_t2(self, token):
  265. """
  266. stem the future prefixes and suffixes
  267. """
  268. if len(token) > 6:
  269. for s2 in self.pl_si2:
  270. # ^Siin Taa
  271. if token.startswith(self.verb_pr2[0]) and token.endswith(s2):
  272. return token[2:-2]
  273. # ^Siin Yaa, Alif Noon$
  274. if token.startswith(self.verb_pr2[1]) and token.endswith(self.pl_si2[0]):
  275. return token[2:-2]
  276. # ^Siin Yaa, Waaw Noon$
  277. if token.startswith(self.verb_pr2[1]) and token.endswith(self.pl_si2[2]):
  278. return token[2:-2]
  279. # ^Siin Taa, Noon$
  280. if (
  281. len(token) > 5
  282. and token.startswith(self.verb_pr2[0])
  283. and token.endswith("\u0646")
  284. ):
  285. return token[2:-1]
  286. # ^Siin Yaa, Noon$
  287. if (
  288. len(token) > 5
  289. and token.startswith(self.verb_pr2[1])
  290. and token.endswith("\u0646")
  291. ):
  292. return token[2:-1]
  293. def verb_t3(self, token):
  294. """
  295. stem the present suffixes
  296. """
  297. if len(token) > 5:
  298. for su3 in self.verb_suf3:
  299. if token.endswith(su3):
  300. return token[:-3]
  301. if len(token) > 4:
  302. for su2 in self.verb_suf2:
  303. if token.endswith(su2):
  304. return token[:-2]
  305. if len(token) > 3:
  306. for su1 in self.verb_suf1:
  307. if token.endswith(su1):
  308. return token[:-1]
  309. def verb_t4(self, token):
  310. """
  311. stem the present prefixes
  312. """
  313. if len(token) > 3:
  314. for pr1 in self.verb_suf1:
  315. if token.startswith(pr1):
  316. return token[1:]
  317. if token.startswith("\u064A"):
  318. return token[1:]
  319. def verb_t5(self, token):
  320. """
  321. stem the future prefixes
  322. """
  323. if len(token) > 4:
  324. for pr2 in self.verb_pr22:
  325. if token.startswith(pr2):
  326. return token[2:]
  327. for pr2 in self.verb_pr2:
  328. if token.startswith(pr2):
  329. return token[2:]
  330. return token
  331. def verb_t6(self, token):
  332. """
  333. stem the order prefixes
  334. """
  335. if len(token) > 4:
  336. for pr3 in self.verb_pr33:
  337. if token.startswith(pr3):
  338. return token[2:]
  339. return token