# -*- coding: utf-8 -*-
#
# Natural Language Toolkit: Snowball Stemmer
#
# Copyright (C) 2001-2020 NLTK Project
# Author: Peter Michael Stahl <pemistahl@gmail.com>
#         Peter Ljunglof <peter.ljunglof@heatherleaf.se> (revisions)
#         Lakhdar Benzahia <lakhdar.benzahia@gmail.com> (co-writer)
#         Assem Chelli <assem.ch@gmail.com> (reviewer arabicstemmer)
#         Abdelkrim Aries <ab_aries@esi.dz> (reviewer arabicstemmer)
# Algorithms: Dr Martin Porter <martin@tartarus.org>
#             Assem Chelli <assem.ch@gmail.com> arabic stemming algorithm
#             Benzahia Lakhdar <lakhdar.benzahia@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
"""
Snowball stemmers

This module provides a port of the Snowball stemmers
developed by Martin Porter.

There is also a demo function: `snowball.demo()`.
"""

import re

from nltk.corpus import stopwords
from nltk.stem import porter
from nltk.stem.util import suffix_replace, prefix_replace
from nltk.stem.api import StemmerI


class SnowballStemmer(StemmerI):
    """
    Snowball Stemmer

    The following languages are supported:
    Arabic, Danish, Dutch, English, Finnish, French, German,
    Hungarian, Italian, Norwegian, Portuguese, Romanian, Russian,
    Spanish and Swedish.

    The algorithm for English is documented here:

        Porter, M. \"An algorithm for suffix stripping.\"
        Program 14.3 (1980): 130-137.

    The algorithms have been developed by Martin Porter.
    These stemmers are called Snowball, because Porter created
    a programming language with this name for creating
    new stemming algorithms. There is more information available
    at http://snowball.tartarus.org/

    The stemmer is invoked as shown below:

    >>> from nltk.stem import SnowballStemmer
    >>> print(" ".join(SnowballStemmer.languages))  # See which languages are supported
    arabic danish dutch english finnish french german hungarian
    italian norwegian porter portuguese romanian russian
    spanish swedish
    >>> stemmer = SnowballStemmer("german")  # Choose a language
    >>> stemmer.stem("Autobahnen")  # Stem a word
    'autobahn'

    Invoking the stemmers that way is useful if you do not know the
    language to be stemmed at runtime. Alternatively, if you already know
    the language, then you can invoke the language specific stemmer directly:

    >>> from nltk.stem.snowball import GermanStemmer
    >>> stemmer = GermanStemmer()
    >>> stemmer.stem("Autobahnen")
    'autobahn'

    :param language: The language whose subclass is instantiated.
    :type language: str or unicode
    :param ignore_stopwords: If set to True, stopwords are
                             not stemmed and returned unchanged.
                             Set to False by default.
    :type ignore_stopwords: bool
    :raise ValueError: If there is no stemmer for the specified
                       language, a ValueError is raised.
    """

    languages = (
        "arabic",
        "danish",
        "dutch",
        "english",
        "finnish",
        "french",
        "german",
        "hungarian",
        "italian",
        "norwegian",
        "porter",
        "portuguese",
        "romanian",
        "russian",
        "spanish",
        "swedish",
    )

    def __init__(self, language, ignore_stopwords=False):
        if language not in self.languages:
            raise ValueError("The language '{0}' is not supported.".format(language))
        stemmerclass = globals()[language.capitalize() + "Stemmer"]
        self.stemmer = stemmerclass(ignore_stopwords)
        self.stem = self.stemmer.stem
        self.stopwords = self.stemmer.stopwords

    def stem(self, token):
        # Delegate to the language-specific stemmer selected in __init__.
        return self.stemmer.stem(token)
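
# A short usage sketch (added for illustration, not part of the original
# module). It assumes the NLTK "stopwords" corpus has been downloaded, which
# ignore_stopwords=True relies on:
#
#     >>> stemmer = SnowballStemmer("english", ignore_stopwords=True)
#     >>> stemmer.stem("running")
#     'run'
#     >>> stemmer.stem("having")  # stopwords are returned unchanged
#     'having'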


class _LanguageSpecificStemmer(StemmerI):
    """
    This helper subclass offers the possibility
    to invoke a specific stemmer directly.
    This is useful if you already know the language to be stemmed at runtime.

    Create an instance of the Snowball stemmer.

    :param ignore_stopwords: If set to True, stopwords are
                             not stemmed and returned unchanged.
                             Set to False by default.
    :type ignore_stopwords: bool
    """

    def __init__(self, ignore_stopwords=False):
        # The language is the name of the class, minus the final "Stemmer".
        language = type(self).__name__.lower()
        if language.endswith("stemmer"):
            language = language[:-7]

        self.stopwords = set()
        if ignore_stopwords:
            try:
                for word in stopwords.words(language):
                    self.stopwords.add(word)
            except IOError:
                raise ValueError(
                    "{!r} has no list of stopwords. Please set"
                    " 'ignore_stopwords' to 'False'.".format(self)
                )

    def __repr__(self):
        """
        Print out the string representation of the respective class.
        """
        return "<{0}>".format(type(self).__name__)
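
# Illustration (added, not from the original source): the language is taken
# from the subclass name, and __repr__ simply echoes that class name:
#
#     >>> from nltk.stem.snowball import GermanStemmer
#     >>> GermanStemmer()
#     <GermanStemmer>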


class PorterStemmer(_LanguageSpecificStemmer, porter.PorterStemmer):
    """
    A word stemmer based on the original Porter stemming algorithm.

        Porter, M. \"An algorithm for suffix stripping.\"
        Program 14.3 (1980): 130-137.

    A few minor modifications have been made to Porter's basic
    algorithm. See the source code of the module
    nltk.stem.porter for more information.
    """

    def __init__(self, ignore_stopwords=False):
        _LanguageSpecificStemmer.__init__(self, ignore_stopwords)
        porter.PorterStemmer.__init__(self)
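
# A minimal sketch (added for illustration; the wrapped algorithm lives in
# nltk.stem.porter):
#
#     >>> from nltk.stem.snowball import PorterStemmer
#     >>> PorterStemmer().stem("caresses")
#     'caress'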


class _ScandinavianStemmer(_LanguageSpecificStemmer):
    """
    This subclass encapsulates a method for defining the string region R1.
    It is used by the Danish, Norwegian, and Swedish stemmer.
    """

    def _r1_scandinavian(self, word, vowels):
        """
        Return the region R1 that is used by the Scandinavian stemmers.

        R1 is the region after the first non-vowel following a vowel,
        or is the null region at the end of the word if there is no
        such non-vowel. But then R1 is adjusted so that the region
        before it contains at least three letters.

        :param word: The word whose region R1 is determined.
        :type word: str or unicode
        :param vowels: The vowels of the respective language that are
                       used to determine the region R1.
        :type vowels: unicode
        :return: the region R1 for the respective word.
        :rtype: unicode
        :note: This helper method is invoked by the respective stem method of
               the subclasses DanishStemmer, NorwegianStemmer, and
               SwedishStemmer. It is not to be invoked directly!
        """
        r1 = ""
        for i in range(1, len(word)):
            if word[i] not in vowels and word[i - 1] in vowels:
                if 3 > len(word[: i + 1]) > 0:
                    r1 = word[3:]
                elif len(word[: i + 1]) >= 3:
                    r1 = word[i + 1 :]
                else:
                    return word
                break

        return r1
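
# Worked example (added for illustration, not part of the original source):
# for the Danish word "indbydelse" with vowels "aeiouy\xE6\xE5\xF8", the first
# non-vowel following a vowel is the "n" at index 1; only two letters precede
# it, so R1 is shifted to start after the third letter, giving "bydelse".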


class _StandardStemmer(_LanguageSpecificStemmer):
    """
    This subclass encapsulates two methods for defining the standard versions
    of the string regions R1, R2, and RV.
    """

    def _r1r2_standard(self, word, vowels):
        """
        Return the standard interpretations of the string regions R1 and R2.

        R1 is the region after the first non-vowel following a vowel,
        or is the null region at the end of the word if there is no
        such non-vowel.

        R2 is the region after the first non-vowel following a vowel
        in R1, or is the null region at the end of the word if there
        is no such non-vowel.

        :param word: The word whose regions R1 and R2 are determined.
        :type word: str or unicode
        :param vowels: The vowels of the respective language that are
                       used to determine the regions R1 and R2.
        :type vowels: unicode
        :return: (r1,r2), the regions R1 and R2 for the respective word.
        :rtype: tuple
        :note: This helper method is invoked by the respective stem method of
               the subclasses DutchStemmer, FinnishStemmer,
               FrenchStemmer, GermanStemmer, ItalianStemmer,
               PortugueseStemmer, RomanianStemmer, and SpanishStemmer.
               It is not to be invoked directly!
        :note: A detailed description of how to define R1 and R2
               can be found at http://snowball.tartarus.org/texts/r1r2.html
        """
        r1 = ""
        r2 = ""
        for i in range(1, len(word)):
            if word[i] not in vowels and word[i - 1] in vowels:
                r1 = word[i + 1 :]
                break

        for i in range(1, len(r1)):
            if r1[i] not in vowels and r1[i - 1] in vowels:
                r2 = r1[i + 1 :]
                break

        return (r1, r2)
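
    # Worked example (added for illustration; it follows the page cited
    # above): for "beautiful" with vowels "aeiouy", the first non-vowel after
    # a vowel is the "t", so R1 = "iful"; within R1 the first non-vowel after
    # a vowel is the "f", so R2 = "ul". For "beauty", R1 = "y" and R2 is empty.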

    def _rv_standard(self, word, vowels):
        """
        Return the standard interpretation of the string region RV.

        If the second letter is a consonant, RV is the region after the
        next following vowel. If the first two letters are vowels, RV is
        the region after the next following consonant. Otherwise, RV is
        the region after the third letter.

        :param word: The word whose region RV is determined.
        :type word: str or unicode
        :param vowels: The vowels of the respective language that are
                       used to determine the region RV.
        :type vowels: unicode
        :return: the region RV for the respective word.
        :rtype: unicode
        :note: This helper method is invoked by the respective stem method of
               the subclasses ItalianStemmer, PortugueseStemmer,
               RomanianStemmer, and SpanishStemmer. It is not to be
               invoked directly!
        """
        rv = ""
        if len(word) >= 2:
            if word[1] not in vowels:
                for i in range(2, len(word)):
                    if word[i] in vowels:
                        rv = word[i + 1 :]
                        break

            elif word[0] in vowels and word[1] in vowels:
                for i in range(2, len(word)):
                    if word[i] not in vowels:
                        rv = word[i + 1 :]
                        break
            else:
                rv = word[3:]

        return rv
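
# Worked examples (added for illustration; they match the Spanish stemmer
# description on the Snowball site): with the Spanish vowels, RV("macho") is
# "ho" (consonant-vowel start, so the region after the third letter),
# RV("trabajo") is "bajo" (second letter is a consonant, so the region after
# the next vowel), and RV("oliva") is "va".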


class ArabicStemmer(_StandardStemmer):
    """
    https://github.com/snowballstem/snowball/blob/master/algorithms/arabic/stem_Unicode.sbl (Original Algorithm)

    The Snowball Arabic light Stemmer

    Algorithm:           Assem Chelli
                         Abdelkrim Aries
                         Lakhdar Benzahia
    NLTK version author: Lakhdar Benzahia
    """

    # Normalize_pre steps
    __vocalization = re.compile(
        r"[\u064b-\u064c-\u064d-\u064e-\u064f-\u0650-\u0651-\u0652]"
    )  # ً، ٌ، ٍ، َ، ُ، ِ، ّ، ْ

    __kasheeda = re.compile(r"[\u0640]")  # ـ tatweel/kasheeda

    __arabic_punctuation_marks = re.compile(r"[\u060C-\u061B-\u061F]")  # ؛ ، ؟

    # Normalize_post
    __last_hamzat = ("\u0623", "\u0625", "\u0622", "\u0624", "\u0626")  # أ، إ، آ، ؤ، ئ

    # normalize other hamzat
    __initial_hamzat = re.compile(r"^[\u0622\u0623\u0625]")  # أ، إ، آ

    __waw_hamza = re.compile(r"[\u0624]")  # ؤ

    __yeh_hamza = re.compile(r"[\u0626]")  # ئ

    __alefat = re.compile(r"[\u0623\u0622\u0625]")  # أ، إ، آ

    # Checks
    __checks1 = (
        "\u0643\u0627\u0644",
        "\u0628\u0627\u0644",  # بال، كال
        "\u0627\u0644",
        "\u0644\u0644",  # لل، ال
    )

    __checks2 = ("\u0629", "\u0627\u062a")  # ة  # female plural ات

    # Suffixes
    __suffix_noun_step1a = (
        "\u064a",
        "\u0643",
        "\u0647",  # ي، ك، ه
        "\u0646\u0627",
        "\u0643\u0645",
        "\u0647\u0627",
        "\u0647\u0646",
        "\u0647\u0645",  # نا، كم، ها، هن، هم
        "\u0643\u0645\u0627",
        "\u0647\u0645\u0627",  # كما، هما
    )

    __suffix_noun_step1b = "\u0646"  # ن

    __suffix_noun_step2a = ("\u0627", "\u064a", "\u0648")  # ا، ي، و

    __suffix_noun_step2b = "\u0627\u062a"  # ات

    __suffix_noun_step2c1 = "\u062a"  # ت

    __suffix_noun_step2c2 = "\u0629"  # ة

    __suffix_noun_step3 = "\u064a"  # ي

    __suffix_verb_step1 = (
        "\u0647",
        "\u0643",  # ه، ك
        "\u0646\u064a",
        "\u0646\u0627",
        "\u0647\u0627",
        "\u0647\u0645",  # ني، نا، ها، هم
        "\u0647\u0646",
        "\u0643\u0645",
        "\u0643\u0646",  # هن، كم، كن
        "\u0647\u0645\u0627",
        "\u0643\u0645\u0627",
        "\u0643\u0645\u0648",  # هما، كما، كمو
    )

    __suffix_verb_step2a = (
        "\u062a",
        "\u0627",
        "\u0646",
        "\u064a",  # ت، ا، ن، ي
        "\u0646\u0627",
        "\u062a\u0627",
        "\u062a\u0646",  # نا، تا، تن Past
        "\u0627\u0646",
        "\u0648\u0646",
        "\u064a\u0646",  # ان، ون، ين Present
        "\u062a\u0645\u0627",  # تما
    )

    __suffix_verb_step2b = ("\u0648\u0627", "\u062a\u0645")  # وا، تم

    __suffix_verb_step2c = ("\u0648", "\u062a\u0645\u0648")  # و  # تمو

    __suffix_all_alef_maqsura = "\u0649"  # ى

    # Prefixes
    __prefix_step1 = (
        "\u0623",  # أ
        "\u0623\u0623",
        "\u0623\u0622",
        "\u0623\u0624",
        "\u0623\u0627",
        "\u0623\u0625",  # أأ، أآ، أؤ، أا، أإ
    )

    __prefix_step2a = ("\u0641\u0627\u0644", "\u0648\u0627\u0644")  # فال، وال

    __prefix_step2b = ("\u0641", "\u0648")  # ف، و

    __prefix_step3a_noun = (
        "\u0627\u0644",
        "\u0644\u0644",  # لل، ال
        "\u0643\u0627\u0644",
        "\u0628\u0627\u0644",  # بال، كال
    )

    __prefix_step3b_noun = (
        "\u0628",
        "\u0643",
        "\u0644",  # ب، ك، ل
        "\u0628\u0628",
        "\u0643\u0643",  # بب، كك
    )

    __prefix_step3_verb = (
        "\u0633\u064a",
        "\u0633\u062a",
        "\u0633\u0646",
        "\u0633\u0623",
    )  # سي، ست، سن، سأ

    __prefix_step4_verb = (
        "\u064a\u0633\u062a",
        "\u0646\u0633\u062a",
        "\u062a\u0633\u062a",
    )  # يست، نست، تست

    # Suffixes added due to verb conjugation
    __conjugation_suffix_verb_1 = ("\u0647", "\u0643")  # ه، ك

    __conjugation_suffix_verb_2 = (
        "\u0646\u064a",
        "\u0646\u0627",
        "\u0647\u0627",  # ني، نا، ها
        "\u0647\u0645",
        "\u0647\u0646",
        "\u0643\u0645",  # هم، هن، كم
        "\u0643\u0646",  # كن
    )

    __conjugation_suffix_verb_3 = (
        "\u0647\u0645\u0627",
        "\u0643\u0645\u0627",
        "\u0643\u0645\u0648",
    )  # هما، كما، كمو

    __conjugation_suffix_verb_4 = ("\u0627", "\u0646", "\u064a")  # ا، ن، ي

    __conjugation_suffix_verb_past = (
        "\u0646\u0627",
        "\u062a\u0627",
        "\u062a\u0646",
    )  # نا، تا، تن

    __conjugation_suffix_verb_present = (
        "\u0627\u0646",
        "\u0648\u0646",
        "\u064a\u0646",
    )  # ان، ون، ين

    # Suffixes added due to noun derivation
    __conjugation_suffix_noun_1 = ("\u064a", "\u0643", "\u0647")  # ي، ك، ه

    __conjugation_suffix_noun_2 = (
        "\u0646\u0627",
        "\u0643\u0645",  # نا، كم
        "\u0647\u0627",
        "\u0647\u0646",
        "\u0647\u0645",  # ها، هن، هم
    )

    __conjugation_suffix_noun_3 = (
        "\u0643\u0645\u0627",
        "\u0647\u0645\u0627",
    )  # كما، هما

    # Prefixes added due to noun derivation
    __prefixes1 = ("\u0648\u0627", "\u0641\u0627")  # فا، وا

    __articles_3len = ("\u0643\u0627\u0644", "\u0628\u0627\u0644")  # بال كال

    __articles_2len = ("\u0627\u0644", "\u0644\u0644")  # ال لل

    # Preposition letters
    __prepositions1 = ("\u0643", "\u0644")  # ك، ل

    __prepositions2 = ("\u0628\u0628", "\u0643\u0643")  # بب، كك

    is_verb = True
    is_noun = True
    is_defined = False

    suffixes_verb_step1_success = False
    suffix_verb_step2a_success = False
    suffix_verb_step2b_success = False

    suffix_noun_step2c2_success = False
    suffix_noun_step1a_success = False
    suffix_noun_step2a_success = False
    suffix_noun_step2b_success = False
    suffixe_noun_step1b_success = False

    prefix_step2a_success = False
    prefix_step3a_noun_success = False
    prefix_step3b_noun_success = False

    def __normalize_pre(self, token):
        """
        :param token: string
        :return: normalized token type string
        """
        # strip diacritics
        token = self.__vocalization.sub("", token)
        # strip kasheeda
        token = self.__kasheeda.sub("", token)
        # strip punctuation marks
        token = self.__arabic_punctuation_marks.sub("", token)
        return token
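
    # Illustration (added, not from the original source): __normalize_pre
    # strips the short-vowel marks and the tatweel character, so a vocalized
    # token such as "كِتَابٌ" and an elongated "كتــاب" both become "كتاب".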

    def __normalize_post(self, token):
        # normalize last hamza
        for hamza in self.__last_hamzat:
            if token.endswith(hamza):
                token = suffix_replace(token, hamza, "\u0621")
                break

        # normalize other hamzat
        token = self.__initial_hamzat.sub("\u0627", token)
        token = self.__waw_hamza.sub("\u0648", token)
        token = self.__yeh_hamza.sub("\u064a", token)
        token = self.__alefat.sub("\u0627", token)
        return token

    def __checks_1(self, token):
        for prefix in self.__checks1:
            if token.startswith(prefix):
                if prefix in self.__articles_3len and len(token) > 4:
                    self.is_noun = True
                    self.is_verb = False
                    self.is_defined = True
                    break

                if prefix in self.__articles_2len and len(token) > 3:
                    self.is_noun = True
                    self.is_verb = False
                    self.is_defined = True
                    break

    def __checks_2(self, token):
        for suffix in self.__checks2:
            if token.endswith(suffix):
                if suffix == "\u0629" and len(token) > 2:
                    self.is_noun = True
                    self.is_verb = False
                    break

                if suffix == "\u0627\u062a" and len(token) > 3:
                    self.is_noun = True
                    self.is_verb = False
                    break

    def __Suffix_Verb_Step1(self, token):
        for suffix in self.__suffix_verb_step1:
            if token.endswith(suffix):
                if suffix in self.__conjugation_suffix_verb_1 and len(token) >= 4:
                    token = token[:-1]
                    self.suffixes_verb_step1_success = True
                    break

                if suffix in self.__conjugation_suffix_verb_2 and len(token) >= 5:
                    token = token[:-2]
                    self.suffixes_verb_step1_success = True
                    break

                if suffix in self.__conjugation_suffix_verb_3 and len(token) >= 6:
                    token = token[:-3]
                    self.suffixes_verb_step1_success = True
                    break
        return token

    def __Suffix_Verb_Step2a(self, token):
        for suffix in self.__suffix_verb_step2a:
            if token.endswith(suffix) and len(token) > 3:
                if suffix == "\u062a" and len(token) >= 4:
                    token = token[:-1]
                    self.suffix_verb_step2a_success = True
                    break

                if suffix in self.__conjugation_suffix_verb_4 and len(token) >= 4:
                    token = token[:-1]
                    self.suffix_verb_step2a_success = True
                    break

                if suffix in self.__conjugation_suffix_verb_past and len(token) >= 5:
                    token = token[:-2]  # past
                    self.suffix_verb_step2a_success = True
                    break

                if suffix in self.__conjugation_suffix_verb_present and len(token) > 5:
                    token = token[:-2]  # present
                    self.suffix_verb_step2a_success = True
                    break

                if suffix == "\u062a\u0645\u0627" and len(token) >= 6:
                    token = token[:-3]
                    self.suffix_verb_step2a_success = True
                    break
        return token

    def __Suffix_Verb_Step2c(self, token):
        for suffix in self.__suffix_verb_step2c:
            if token.endswith(suffix):
                if suffix == "\u062a\u0645\u0648" and len(token) >= 6:
                    token = token[:-3]
                    break

                if suffix == "\u0648" and len(token) >= 4:
                    token = token[:-1]
                    break
        return token

    def __Suffix_Verb_Step2b(self, token):
        for suffix in self.__suffix_verb_step2b:
            if token.endswith(suffix) and len(token) >= 5:
                token = token[:-2]
                self.suffix_verb_step2b_success = True
                break
        return token

    def __Suffix_Noun_Step2c2(self, token):
        for suffix in self.__suffix_noun_step2c2:
            if token.endswith(suffix) and len(token) >= 3:
                token = token[:-1]
                self.suffix_noun_step2c2_success = True
                break
        return token

    def __Suffix_Noun_Step1a(self, token):
        for suffix in self.__suffix_noun_step1a:
            if token.endswith(suffix):
                if suffix in self.__conjugation_suffix_noun_1 and len(token) >= 4:
                    token = token[:-1]
                    self.suffix_noun_step1a_success = True
                    break

                if suffix in self.__conjugation_suffix_noun_2 and len(token) >= 5:
                    token = token[:-2]
                    self.suffix_noun_step1a_success = True
                    break

                if suffix in self.__conjugation_suffix_noun_3 and len(token) >= 6:
                    token = token[:-3]
                    self.suffix_noun_step1a_success = True
                    break
        return token

    def __Suffix_Noun_Step2a(self, token):
        for suffix in self.__suffix_noun_step2a:
            if token.endswith(suffix) and len(token) > 4:
                token = token[:-1]
                self.suffix_noun_step2a_success = True
                break
        return token

    def __Suffix_Noun_Step2b(self, token):
        for suffix in self.__suffix_noun_step2b:
            if token.endswith(suffix) and len(token) >= 5:
                token = token[:-2]
                self.suffix_noun_step2b_success = True
                break
        return token

    def __Suffix_Noun_Step2c1(self, token):
        for suffix in self.__suffix_noun_step2c1:
            if token.endswith(suffix) and len(token) >= 4:
                token = token[:-1]
                break
        return token

    def __Suffix_Noun_Step1b(self, token):
        for suffix in self.__suffix_noun_step1b:
            if token.endswith(suffix) and len(token) > 5:
                token = token[:-1]
                self.suffixe_noun_step1b_success = True
                break
        return token

    def __Suffix_Noun_Step3(self, token):
        for suffix in self.__suffix_noun_step3:
            if token.endswith(suffix) and len(token) >= 3:
                token = token[:-1]  # ya' nisbiya
                break
        return token

    def __Suffix_All_alef_maqsura(self, token):
        for suffix in self.__suffix_all_alef_maqsura:
            if token.endswith(suffix):
                token = suffix_replace(token, suffix, "\u064a")
        return token

    def __Prefix_Step1(self, token):
        for prefix in self.__prefix_step1:
            if token.startswith(prefix) and len(token) > 3:
                if prefix == "\u0623\u0623":
                    token = prefix_replace(token, prefix, "\u0623")
                    break

                elif prefix == "\u0623\u0622":
                    token = prefix_replace(token, prefix, "\u0622")
                    break

                elif prefix == "\u0623\u0624":
                    token = prefix_replace(token, prefix, "\u0624")
                    break

                elif prefix == "\u0623\u0627":
                    token = prefix_replace(token, prefix, "\u0627")
                    break

                elif prefix == "\u0623\u0625":
                    token = prefix_replace(token, prefix, "\u0625")
                    break
        return token

    def __Prefix_Step2a(self, token):
        for prefix in self.__prefix_step2a:
            if token.startswith(prefix) and len(token) > 5:
                token = token[len(prefix) :]
                self.prefix_step2a_success = True
                break
        return token

    def __Prefix_Step2b(self, token):
        for prefix in self.__prefix_step2b:
            if token.startswith(prefix) and len(token) > 3:
                if token[:2] not in self.__prefixes1:
                    token = token[len(prefix) :]
                    break
        return token

    def __Prefix_Step3a_Noun(self, token):
        for prefix in self.__prefix_step3a_noun:
            if token.startswith(prefix):
                if prefix in self.__articles_2len and len(token) > 4:
                    token = token[len(prefix) :]
                    self.prefix_step3a_noun_success = True
                    break

                if prefix in self.__articles_3len and len(token) > 5:
                    token = token[len(prefix) :]
                    break
        return token

    def __Prefix_Step3b_Noun(self, token):
        for prefix in self.__prefix_step3b_noun:
            if token.startswith(prefix):
                if len(token) > 3:
                    if prefix == "\u0628":
                        token = token[len(prefix) :]
                        self.prefix_step3b_noun_success = True
                        break

                    if prefix in self.__prepositions2:
                        token = prefix_replace(token, prefix, prefix[1])
                        self.prefix_step3b_noun_success = True
                        break

                if prefix in self.__prepositions1 and len(token) > 4:
                    token = token[len(prefix) :]  # BUG: cause confusion
                    self.prefix_step3b_noun_success = True
                    break
        return token

    def __Prefix_Step3_Verb(self, token):
        for prefix in self.__prefix_step3_verb:
            if token.startswith(prefix) and len(token) > 4:
                token = prefix_replace(token, prefix, prefix[1])
                break
        return token

    def __Prefix_Step4_Verb(self, token):
        for prefix in self.__prefix_step4_verb:
            if token.startswith(prefix) and len(token) > 4:
                token = prefix_replace(token, prefix, "\u0627\u0633\u062a")
                self.is_verb = True
                self.is_noun = False
                break
        return token

    def stem(self, word):
        """
        Stem an Arabic word and return the stemmed form.

        :param word: string
        :return: string
        """
        # set initial values
        self.is_verb = True
        self.is_noun = True
        self.is_defined = False

        self.suffixes_verb_step1_success = False
        self.suffix_verb_step2a_success = False
        self.suffix_verb_step2b_success = False

        self.suffix_noun_step2c2_success = False
        self.suffix_noun_step1a_success = False
        self.suffix_noun_step2a_success = False
        self.suffix_noun_step2b_success = False
        self.suffixe_noun_step1b_success = False

        self.prefix_step2a_success = False
        self.prefix_step3a_noun_success = False
        self.prefix_step3b_noun_success = False

        modified_word = word
        # guess type and properties
        # checks1
        self.__checks_1(modified_word)
        # checks2
        self.__checks_2(modified_word)
        # Pre_Normalization
        modified_word = self.__normalize_pre(modified_word)
        # Avoid stopwords
        if modified_word in self.stopwords or len(modified_word) <= 2:
            return modified_word

        # Start stemming
        if self.is_verb:
            modified_word = self.__Suffix_Verb_Step1(modified_word)
            if self.suffixes_verb_step1_success:
                modified_word = self.__Suffix_Verb_Step2a(modified_word)
                if not self.suffix_verb_step2a_success:
                    modified_word = self.__Suffix_Verb_Step2c(modified_word)
                # TODO: handle the "or next" instruction of the Snowball source
            else:
                modified_word = self.__Suffix_Verb_Step2b(modified_word)
                if not self.suffix_verb_step2b_success:
                    modified_word = self.__Suffix_Verb_Step2a(modified_word)
        if self.is_noun:
            modified_word = self.__Suffix_Noun_Step2c2(modified_word)
            if not self.suffix_noun_step2c2_success:
                if not self.is_defined:
                    modified_word = self.__Suffix_Noun_Step1a(modified_word)
                    # if self.suffix_noun_step1a_success:
                    modified_word = self.__Suffix_Noun_Step2a(modified_word)
                    if not self.suffix_noun_step2a_success:
                        modified_word = self.__Suffix_Noun_Step2b(modified_word)
                    if (
                        not self.suffix_noun_step2b_success
                        and not self.suffix_noun_step2a_success
                    ):
                        modified_word = self.__Suffix_Noun_Step2c1(modified_word)
                    # TODO: handle the "or next" instruction of the Snowball source
                else:
                    modified_word = self.__Suffix_Noun_Step1b(modified_word)
                    if self.suffixe_noun_step1b_success:
                        modified_word = self.__Suffix_Noun_Step2a(modified_word)
                        if not self.suffix_noun_step2a_success:
                            modified_word = self.__Suffix_Noun_Step2b(modified_word)
                        if (
                            not self.suffix_noun_step2b_success
                            and not self.suffix_noun_step2a_success
                        ):
                            modified_word = self.__Suffix_Noun_Step2c1(modified_word)
                    else:
                        if not self.is_defined:
                            modified_word = self.__Suffix_Noun_Step2a(modified_word)
                        modified_word = self.__Suffix_Noun_Step2b(modified_word)
            modified_word = self.__Suffix_Noun_Step3(modified_word)
        if not self.is_noun and self.is_verb:
            modified_word = self.__Suffix_All_alef_maqsura(modified_word)

        # prefixes
        modified_word = self.__Prefix_Step1(modified_word)
        modified_word = self.__Prefix_Step2a(modified_word)
        if not self.prefix_step2a_success:
            modified_word = self.__Prefix_Step2b(modified_word)
        modified_word = self.__Prefix_Step3a_Noun(modified_word)
        if not self.prefix_step3a_noun_success and self.is_noun:
            modified_word = self.__Prefix_Step3b_Noun(modified_word)
        else:
            if not self.prefix_step3b_noun_success and self.is_verb:
                modified_word = self.__Prefix_Step3_Verb(modified_word)
                modified_word = self.__Prefix_Step4_Verb(modified_word)

        # post normalization stemming
        modified_word = self.__normalize_post(modified_word)
        stemmed_word = modified_word
        return stemmed_word
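
# A small illustration (added, not from the original source): with the default
# settings the light stemmer removes the definite article and the feminine
# ending ta marbuta:
#
#     >>> ArabicStemmer().stem("الطالبة")
#     'طالب'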


class DanishStemmer(_ScandinavianStemmer):
    """
    The Danish Snowball stemmer.

    :cvar __vowels: The Danish vowels.
    :type __vowels: unicode
    :cvar __consonants: The Danish consonants.
    :type __consonants: unicode
    :cvar __double_consonants: The Danish double consonants.
    :type __double_consonants: tuple
    :cvar __s_ending: Letters that may directly appear before a word final 's'.
    :type __s_ending: unicode
    :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm.
    :type __step1_suffixes: tuple
    :cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm.
    :type __step2_suffixes: tuple
    :cvar __step3_suffixes: Suffixes to be deleted in step 3 of the algorithm.
    :type __step3_suffixes: tuple
    :note: A detailed description of the Danish
           stemming algorithm can be found under
           http://snowball.tartarus.org/algorithms/danish/stemmer.html
    """

    # The language's vowels and other important characters are defined.
    __vowels = "aeiouy\xE6\xE5\xF8"
    __consonants = "bcdfghjklmnpqrstvwxz"
    __double_consonants = (
        "bb",
        "cc",
        "dd",
        "ff",
        "gg",
        "hh",
        "jj",
        "kk",
        "ll",
        "mm",
        "nn",
        "pp",
        "qq",
        "rr",
        "ss",
        "tt",
        "vv",
        "ww",
        "xx",
        "zz",
    )
    __s_ending = "abcdfghjklmnoprtvyz\xE5"

    # The different suffixes, divided into the algorithm's steps
    # and organized by length, are listed in tuples.
    __step1_suffixes = (
        "erendes",
        "erende",
        "hedens",
        "ethed",
        "erede",
        "heden",
        "heder",
        "endes",
        "ernes",
        "erens",
        "erets",
        "ered",
        "ende",
        "erne",
        "eren",
        "erer",
        "heds",
        "enes",
        "eres",
        "eret",
        "hed",
        "ene",
        "ere",
        "ens",
        "ers",
        "ets",
        "en",
        "er",
        "es",
        "et",
        "e",
        "s",
    )
    __step2_suffixes = ("gd", "dt", "gt", "kt")
    __step3_suffixes = ("elig", "l\xF8st", "lig", "els", "ig")

    def stem(self, word):
        """
        Stem a Danish word and return the stemmed form.

        :param word: The word that is stemmed.
        :type word: str or unicode
        :return: The stemmed form.
        :rtype: unicode
        """
        # Every word is put into lower case for normalization.
        word = word.lower()

        if word in self.stopwords:
            return word

        # After this, the required regions are generated
        # by the respective helper method.
        r1 = self._r1_scandinavian(word, self.__vowels)

        # Then the actual stemming process starts.
        # Every new step is explicitly indicated
        # according to the descriptions on the Snowball website.

        # STEP 1
        for suffix in self.__step1_suffixes:
            if r1.endswith(suffix):
                if suffix == "s":
                    if word[-2] in self.__s_ending:
                        word = word[:-1]
                        r1 = r1[:-1]
                else:
                    word = word[: -len(suffix)]
                    r1 = r1[: -len(suffix)]
                break

        # STEP 2
        for suffix in self.__step2_suffixes:
            if r1.endswith(suffix):
                word = word[:-1]
                r1 = r1[:-1]
                break

        # STEP 3
        if r1.endswith("igst"):
            word = word[:-2]
            r1 = r1[:-2]

        for suffix in self.__step3_suffixes:
            if r1.endswith(suffix):
                if suffix == "l\xF8st":
                    word = word[:-1]
                    r1 = r1[:-1]
                else:
                    word = word[: -len(suffix)]
                    r1 = r1[: -len(suffix)]

                    if r1.endswith(self.__step2_suffixes):
                        word = word[:-1]
                        r1 = r1[:-1]
                break

        # STEP 4: Undouble
        for double_cons in self.__double_consonants:
            if word.endswith(double_cons) and len(word) > 3:
                word = word[:-1]
                break

        return word
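
# A quick example (added for illustration, not part of the original source):
#
#     >>> DanishStemmer().stem("bogen")
#     'bog'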


class DutchStemmer(_StandardStemmer):
    """
    The Dutch Snowball stemmer.

    :cvar __vowels: The Dutch vowels.
    :type __vowels: unicode
    :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm.
    :type __step1_suffixes: tuple
    :cvar __step3b_suffixes: Suffixes to be deleted in step 3b of the algorithm.
    :type __step3b_suffixes: tuple
    :note: A detailed description of the Dutch
           stemming algorithm can be found under
           http://snowball.tartarus.org/algorithms/dutch/stemmer.html
    """

    __vowels = "aeiouy\xE8"
    __step1_suffixes = ("heden", "ene", "en", "se", "s")
    __step3b_suffixes = ("baar", "lijk", "bar", "end", "ing", "ig")
  911. def stem(self, word):
  912. """
  913. Stem a Dutch word and return the stemmed form.
  914. :param word: The word that is stemmed.
  915. :type word: str or unicode
  916. :return: The stemmed form.
  917. :rtype: unicode
  918. """
  919. word = word.lower()
  920. if word in self.stopwords:
  921. return word
  922. step2_success = False
  923. # Vowel accents are removed.
  924. word = (
  925. word.replace("\xE4", "a")
  926. .replace("\xE1", "a")
  927. .replace("\xEB", "e")
  928. .replace("\xE9", "e")
  929. .replace("\xED", "i")
  930. .replace("\xEF", "i")
  931. .replace("\xF6", "o")
  932. .replace("\xF3", "o")
  933. .replace("\xFC", "u")
  934. .replace("\xFA", "u")
  935. )
936. # An initial 'y', a 'y' after a vowel,
937. # and an 'i' between vowels are put into upper case.
938. # From this point on they are treated as consonants.
  939. if word.startswith("y"):
  940. word = "".join(("Y", word[1:]))
  941. for i in range(1, len(word)):
  942. if word[i - 1] in self.__vowels and word[i] == "y":
  943. word = "".join((word[:i], "Y", word[i + 1 :]))
  944. for i in range(1, len(word) - 1):
  945. if (
  946. word[i - 1] in self.__vowels
  947. and word[i] == "i"
  948. and word[i + 1] in self.__vowels
  949. ):
  950. word = "".join((word[:i], "I", word[i + 1 :]))
  951. r1, r2 = self._r1r2_standard(word, self.__vowels)
  952. # R1 is adjusted so that the region before it
  953. # contains at least 3 letters.
  954. for i in range(1, len(word)):
  955. if word[i] not in self.__vowels and word[i - 1] in self.__vowels:
  956. if 3 > len(word[: i + 1]) > 0:
  957. r1 = word[3:]
  958. elif len(word[: i + 1]) == 0:
  959. return word
  960. break
  961. # STEP 1
  962. for suffix in self.__step1_suffixes:
  963. if r1.endswith(suffix):
  964. if suffix == "heden":
  965. word = suffix_replace(word, suffix, "heid")
  966. r1 = suffix_replace(r1, suffix, "heid")
  967. if r2.endswith("heden"):
  968. r2 = suffix_replace(r2, suffix, "heid")
  969. elif (
  970. suffix in ("ene", "en")
  971. and not word.endswith("heden")
  972. and word[-len(suffix) - 1] not in self.__vowels
  973. and word[-len(suffix) - 3 : -len(suffix)] != "gem"
  974. ):
  975. word = word[: -len(suffix)]
  976. r1 = r1[: -len(suffix)]
  977. r2 = r2[: -len(suffix)]
  978. if word.endswith(("kk", "dd", "tt")):
  979. word = word[:-1]
  980. r1 = r1[:-1]
  981. r2 = r2[:-1]
  982. elif (
  983. suffix in ("se", "s")
  984. and word[-len(suffix) - 1] not in self.__vowels
  985. and word[-len(suffix) - 1] != "j"
  986. ):
  987. word = word[: -len(suffix)]
  988. r1 = r1[: -len(suffix)]
  989. r2 = r2[: -len(suffix)]
  990. break
  991. # STEP 2
  992. if r1.endswith("e") and word[-2] not in self.__vowels:
  993. step2_success = True
  994. word = word[:-1]
  995. r1 = r1[:-1]
  996. r2 = r2[:-1]
  997. if word.endswith(("kk", "dd", "tt")):
  998. word = word[:-1]
  999. r1 = r1[:-1]
  1000. r2 = r2[:-1]
  1001. # STEP 3a
  1002. if r2.endswith("heid") and word[-5] != "c":
  1003. word = word[:-4]
  1004. r1 = r1[:-4]
  1005. r2 = r2[:-4]
  1006. if (
  1007. r1.endswith("en")
  1008. and word[-3] not in self.__vowels
  1009. and word[-5:-2] != "gem"
  1010. ):
  1011. word = word[:-2]
  1012. r1 = r1[:-2]
  1013. r2 = r2[:-2]
  1014. if word.endswith(("kk", "dd", "tt")):
  1015. word = word[:-1]
  1016. r1 = r1[:-1]
  1017. r2 = r2[:-1]
  1018. # STEP 3b: Derivational suffixes
  1019. for suffix in self.__step3b_suffixes:
  1020. if r2.endswith(suffix):
  1021. if suffix in ("end", "ing"):
  1022. word = word[:-3]
  1023. r2 = r2[:-3]
  1024. if r2.endswith("ig") and word[-3] != "e":
  1025. word = word[:-2]
  1026. else:
  1027. if word.endswith(("kk", "dd", "tt")):
  1028. word = word[:-1]
  1029. elif suffix == "ig" and word[-3] != "e":
  1030. word = word[:-2]
  1031. elif suffix == "lijk":
  1032. word = word[:-4]
  1033. r1 = r1[:-4]
  1034. if r1.endswith("e") and word[-2] not in self.__vowels:
  1035. word = word[:-1]
  1036. if word.endswith(("kk", "dd", "tt")):
  1037. word = word[:-1]
  1038. elif suffix == "baar":
  1039. word = word[:-4]
  1040. elif suffix == "bar" and step2_success:
  1041. word = word[:-3]
  1042. break
  1043. # STEP 4: Undouble vowel
  1044. if len(word) >= 4:
  1045. if word[-1] not in self.__vowels and word[-1] != "I":
  1046. if word[-3:-1] in ("aa", "ee", "oo", "uu"):
  1047. if word[-4] not in self.__vowels:
  1048. word = "".join((word[:-3], word[-3], word[-1]))
  1049. # All occurrences of 'I' and 'Y' are put back into lower case.
  1050. word = word.replace("I", "i").replace("Y", "y")
  1051. return word
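# Added illustration: a worked example for step 1 above. For "mogelijkheden",
# R1 is "elijkheden" (the region after the first non-vowel that follows a
# vowel), which ends in "heden", so step 1 rewrites the ending to "heid",
# giving "mogelijkheid"; later steps may shorten the word further.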
  1052. class EnglishStemmer(_StandardStemmer):
  1053. """
  1054. The English Snowball stemmer.
  1055. :cvar __vowels: The English vowels.
  1056. :type __vowels: unicode
  1057. :cvar __double_consonants: The English double consonants.
  1058. :type __double_consonants: tuple
  1059. :cvar __li_ending: Letters that may directly appear before a word final 'li'.
  1060. :type __li_ending: unicode
  1061. :cvar __step0_suffixes: Suffixes to be deleted in step 0 of the algorithm.
  1062. :type __step0_suffixes: tuple
  1063. :cvar __step1a_suffixes: Suffixes to be deleted in step 1a of the algorithm.
  1064. :type __step1a_suffixes: tuple
  1065. :cvar __step1b_suffixes: Suffixes to be deleted in step 1b of the algorithm.
  1066. :type __step1b_suffixes: tuple
  1067. :cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm.
  1068. :type __step2_suffixes: tuple
  1069. :cvar __step3_suffixes: Suffixes to be deleted in step 3 of the algorithm.
  1070. :type __step3_suffixes: tuple
  1071. :cvar __step4_suffixes: Suffixes to be deleted in step 4 of the algorithm.
  1072. :type __step4_suffixes: tuple
  1073. :cvar __step5_suffixes: Suffixes to be deleted in step 5 of the algorithm.
  1074. :type __step5_suffixes: tuple
  1075. :cvar __special_words: A dictionary containing words
  1076. which have to be stemmed specially.
  1077. :type __special_words: dict
  1078. :note: A detailed description of the English
  1079. stemming algorithm can be found under
  1080. http://snowball.tartarus.org/algorithms/english/stemmer.html
  1081. """
  1082. __vowels = "aeiouy"
  1083. __double_consonants = ("bb", "dd", "ff", "gg", "mm", "nn", "pp", "rr", "tt")
  1084. __li_ending = "cdeghkmnrt"
  1085. __step0_suffixes = ("'s'", "'s", "'")
  1086. __step1a_suffixes = ("sses", "ied", "ies", "us", "ss", "s")
  1087. __step1b_suffixes = ("eedly", "ingly", "edly", "eed", "ing", "ed")
  1088. __step2_suffixes = (
  1089. "ization",
  1090. "ational",
  1091. "fulness",
  1092. "ousness",
  1093. "iveness",
  1094. "tional",
  1095. "biliti",
  1096. "lessli",
  1097. "entli",
  1098. "ation",
  1099. "alism",
  1100. "aliti",
  1101. "ousli",
  1102. "iviti",
  1103. "fulli",
  1104. "enci",
  1105. "anci",
  1106. "abli",
  1107. "izer",
  1108. "ator",
  1109. "alli",
  1110. "bli",
  1111. "ogi",
  1112. "li",
  1113. )
  1114. __step3_suffixes = (
  1115. "ational",
  1116. "tional",
  1117. "alize",
  1118. "icate",
  1119. "iciti",
  1120. "ative",
  1121. "ical",
  1122. "ness",
  1123. "ful",
  1124. )
  1125. __step4_suffixes = (
  1126. "ement",
  1127. "ance",
  1128. "ence",
  1129. "able",
  1130. "ible",
  1131. "ment",
  1132. "ant",
  1133. "ent",
  1134. "ism",
  1135. "ate",
  1136. "iti",
  1137. "ous",
  1138. "ive",
  1139. "ize",
  1140. "ion",
  1141. "al",
  1142. "er",
  1143. "ic",
  1144. )
  1145. __step5_suffixes = ("e", "l")
  1146. __special_words = {
  1147. "skis": "ski",
  1148. "skies": "sky",
  1149. "dying": "die",
  1150. "lying": "lie",
  1151. "tying": "tie",
  1152. "idly": "idl",
  1153. "gently": "gentl",
  1154. "ugly": "ugli",
  1155. "early": "earli",
  1156. "only": "onli",
  1157. "singly": "singl",
  1158. "sky": "sky",
  1159. "news": "news",
  1160. "howe": "howe",
  1161. "atlas": "atlas",
  1162. "cosmos": "cosmos",
  1163. "bias": "bias",
  1164. "andes": "andes",
  1165. "inning": "inning",
  1166. "innings": "inning",
  1167. "outing": "outing",
  1168. "outings": "outing",
  1169. "canning": "canning",
  1170. "cannings": "canning",
  1171. "herring": "herring",
  1172. "herrings": "herring",
  1173. "earring": "earring",
  1174. "earrings": "earring",
  1175. "proceed": "proceed",
  1176. "proceeds": "proceed",
  1177. "proceeded": "proceed",
  1178. "proceeding": "proceed",
  1179. "exceed": "exceed",
  1180. "exceeds": "exceed",
  1181. "exceeded": "exceed",
  1182. "exceeding": "exceed",
  1183. "succeed": "succeed",
  1184. "succeeds": "succeed",
  1185. "succeeded": "succeed",
  1186. "succeeding": "succeed",
  1187. }
  1188. def stem(self, word):
  1189. """
  1190. Stem an English word and return the stemmed form.
  1191. :param word: The word that is stemmed.
  1192. :type word: str or unicode
  1193. :return: The stemmed form.
  1194. :rtype: unicode
  1195. """
  1196. word = word.lower()
  1197. if word in self.stopwords or len(word) <= 2:
  1198. return word
  1199. elif word in self.__special_words:
  1200. return self.__special_words[word]
  1201. # Map the different apostrophe characters to a single consistent one
  1202. word = (
  1203. word.replace("\u2019", "\x27")
  1204. .replace("\u2018", "\x27")
  1205. .replace("\u201B", "\x27")
  1206. )
  1207. if word.startswith("\x27"):
  1208. word = word[1:]
  1209. if word.startswith("y"):
  1210. word = "".join(("Y", word[1:]))
  1211. for i in range(1, len(word)):
  1212. if word[i - 1] in self.__vowels and word[i] == "y":
  1213. word = "".join((word[:i], "Y", word[i + 1 :]))
  1214. step1a_vowel_found = False
  1215. step1b_vowel_found = False
  1216. r1 = ""
  1217. r2 = ""
  1218. if word.startswith(("gener", "commun", "arsen")):
  1219. if word.startswith(("gener", "arsen")):
  1220. r1 = word[5:]
  1221. else:
  1222. r1 = word[6:]
  1223. for i in range(1, len(r1)):
  1224. if r1[i] not in self.__vowels and r1[i - 1] in self.__vowels:
  1225. r2 = r1[i + 1 :]
  1226. break
  1227. else:
  1228. r1, r2 = self._r1r2_standard(word, self.__vowels)
  1229. # STEP 0
  1230. for suffix in self.__step0_suffixes:
  1231. if word.endswith(suffix):
  1232. word = word[: -len(suffix)]
  1233. r1 = r1[: -len(suffix)]
  1234. r2 = r2[: -len(suffix)]
  1235. break
  1236. # STEP 1a
  1237. for suffix in self.__step1a_suffixes:
  1238. if word.endswith(suffix):
  1239. if suffix == "sses":
  1240. word = word[:-2]
  1241. r1 = r1[:-2]
  1242. r2 = r2[:-2]
  1243. elif suffix in ("ied", "ies"):
  1244. if len(word[: -len(suffix)]) > 1:
  1245. word = word[:-2]
  1246. r1 = r1[:-2]
  1247. r2 = r2[:-2]
  1248. else:
  1249. word = word[:-1]
  1250. r1 = r1[:-1]
  1251. r2 = r2[:-1]
  1252. elif suffix == "s":
  1253. for letter in word[:-2]:
  1254. if letter in self.__vowels:
  1255. step1a_vowel_found = True
  1256. break
  1257. if step1a_vowel_found:
  1258. word = word[:-1]
  1259. r1 = r1[:-1]
  1260. r2 = r2[:-1]
  1261. break
  1262. # STEP 1b
  1263. for suffix in self.__step1b_suffixes:
  1264. if word.endswith(suffix):
  1265. if suffix in ("eed", "eedly"):
  1266. if r1.endswith(suffix):
  1267. word = suffix_replace(word, suffix, "ee")
  1268. if len(r1) >= len(suffix):
  1269. r1 = suffix_replace(r1, suffix, "ee")
  1270. else:
  1271. r1 = ""
  1272. if len(r2) >= len(suffix):
  1273. r2 = suffix_replace(r2, suffix, "ee")
  1274. else:
  1275. r2 = ""
  1276. else:
  1277. for letter in word[: -len(suffix)]:
  1278. if letter in self.__vowels:
  1279. step1b_vowel_found = True
  1280. break
  1281. if step1b_vowel_found:
  1282. word = word[: -len(suffix)]
  1283. r1 = r1[: -len(suffix)]
  1284. r2 = r2[: -len(suffix)]
  1285. if word.endswith(("at", "bl", "iz")):
  1286. word = "".join((word, "e"))
  1287. r1 = "".join((r1, "e"))
  1288. if len(word) > 5 or len(r1) >= 3:
  1289. r2 = "".join((r2, "e"))
  1290. elif word.endswith(self.__double_consonants):
  1291. word = word[:-1]
  1292. r1 = r1[:-1]
  1293. r2 = r2[:-1]
  1294. elif (
  1295. r1 == ""
  1296. and len(word) >= 3
  1297. and word[-1] not in self.__vowels
  1298. and word[-1] not in "wxY"
  1299. and word[-2] in self.__vowels
  1300. and word[-3] not in self.__vowels
  1301. ) or (
  1302. r1 == ""
  1303. and len(word) == 2
  1304. and word[0] in self.__vowels
  1305. and word[1] not in self.__vowels
  1306. ):
  1307. word = "".join((word, "e"))
  1308. if len(r1) > 0:
  1309. r1 = "".join((r1, "e"))
  1310. if len(r2) > 0:
  1311. r2 = "".join((r2, "e"))
  1312. break
  1313. # STEP 1c
  1314. if len(word) > 2 and word[-1] in "yY" and word[-2] not in self.__vowels:
  1315. word = "".join((word[:-1], "i"))
  1316. if len(r1) >= 1:
  1317. r1 = "".join((r1[:-1], "i"))
  1318. else:
  1319. r1 = ""
  1320. if len(r2) >= 1:
  1321. r2 = "".join((r2[:-1], "i"))
  1322. else:
  1323. r2 = ""
  1324. # STEP 2
  1325. for suffix in self.__step2_suffixes:
  1326. if word.endswith(suffix):
  1327. if r1.endswith(suffix):
  1328. if suffix == "tional":
  1329. word = word[:-2]
  1330. r1 = r1[:-2]
  1331. r2 = r2[:-2]
  1332. elif suffix in ("enci", "anci", "abli"):
  1333. word = "".join((word[:-1], "e"))
  1334. if len(r1) >= 1:
  1335. r1 = "".join((r1[:-1], "e"))
  1336. else:
  1337. r1 = ""
  1338. if len(r2) >= 1:
  1339. r2 = "".join((r2[:-1], "e"))
  1340. else:
  1341. r2 = ""
  1342. elif suffix == "entli":
  1343. word = word[:-2]
  1344. r1 = r1[:-2]
  1345. r2 = r2[:-2]
  1346. elif suffix in ("izer", "ization"):
  1347. word = suffix_replace(word, suffix, "ize")
  1348. if len(r1) >= len(suffix):
  1349. r1 = suffix_replace(r1, suffix, "ize")
  1350. else:
  1351. r1 = ""
  1352. if len(r2) >= len(suffix):
  1353. r2 = suffix_replace(r2, suffix, "ize")
  1354. else:
  1355. r2 = ""
  1356. elif suffix in ("ational", "ation", "ator"):
  1357. word = suffix_replace(word, suffix, "ate")
  1358. if len(r1) >= len(suffix):
  1359. r1 = suffix_replace(r1, suffix, "ate")
  1360. else:
  1361. r1 = ""
  1362. if len(r2) >= len(suffix):
  1363. r2 = suffix_replace(r2, suffix, "ate")
  1364. else:
  1365. r2 = "e"
  1366. elif suffix in ("alism", "aliti", "alli"):
  1367. word = suffix_replace(word, suffix, "al")
  1368. if len(r1) >= len(suffix):
  1369. r1 = suffix_replace(r1, suffix, "al")
  1370. else:
  1371. r1 = ""
  1372. if len(r2) >= len(suffix):
  1373. r2 = suffix_replace(r2, suffix, "al")
  1374. else:
  1375. r2 = ""
  1376. elif suffix == "fulness":
  1377. word = word[:-4]
  1378. r1 = r1[:-4]
  1379. r2 = r2[:-4]
  1380. elif suffix in ("ousli", "ousness"):
  1381. word = suffix_replace(word, suffix, "ous")
  1382. if len(r1) >= len(suffix):
  1383. r1 = suffix_replace(r1, suffix, "ous")
  1384. else:
  1385. r1 = ""
  1386. if len(r2) >= len(suffix):
  1387. r2 = suffix_replace(r2, suffix, "ous")
  1388. else:
  1389. r2 = ""
  1390. elif suffix in ("iveness", "iviti"):
  1391. word = suffix_replace(word, suffix, "ive")
  1392. if len(r1) >= len(suffix):
  1393. r1 = suffix_replace(r1, suffix, "ive")
  1394. else:
  1395. r1 = ""
  1396. if len(r2) >= len(suffix):
  1397. r2 = suffix_replace(r2, suffix, "ive")
  1398. else:
  1399. r2 = "e"
  1400. elif suffix in ("biliti", "bli"):
  1401. word = suffix_replace(word, suffix, "ble")
  1402. if len(r1) >= len(suffix):
  1403. r1 = suffix_replace(r1, suffix, "ble")
  1404. else:
  1405. r1 = ""
  1406. if len(r2) >= len(suffix):
  1407. r2 = suffix_replace(r2, suffix, "ble")
  1408. else:
  1409. r2 = ""
  1410. elif suffix == "ogi" and word[-4] == "l":
  1411. word = word[:-1]
  1412. r1 = r1[:-1]
  1413. r2 = r2[:-1]
  1414. elif suffix in ("fulli", "lessli"):
  1415. word = word[:-2]
  1416. r1 = r1[:-2]
  1417. r2 = r2[:-2]
  1418. elif suffix == "li" and word[-3] in self.__li_ending:
  1419. word = word[:-2]
  1420. r1 = r1[:-2]
  1421. r2 = r2[:-2]
  1422. break
  1423. # STEP 3
  1424. for suffix in self.__step3_suffixes:
  1425. if word.endswith(suffix):
  1426. if r1.endswith(suffix):
  1427. if suffix == "tional":
  1428. word = word[:-2]
  1429. r1 = r1[:-2]
  1430. r2 = r2[:-2]
  1431. elif suffix == "ational":
  1432. word = suffix_replace(word, suffix, "ate")
  1433. if len(r1) >= len(suffix):
  1434. r1 = suffix_replace(r1, suffix, "ate")
  1435. else:
  1436. r1 = ""
  1437. if len(r2) >= len(suffix):
  1438. r2 = suffix_replace(r2, suffix, "ate")
  1439. else:
  1440. r2 = ""
  1441. elif suffix == "alize":
  1442. word = word[:-3]
  1443. r1 = r1[:-3]
  1444. r2 = r2[:-3]
  1445. elif suffix in ("icate", "iciti", "ical"):
  1446. word = suffix_replace(word, suffix, "ic")
  1447. if len(r1) >= len(suffix):
  1448. r1 = suffix_replace(r1, suffix, "ic")
  1449. else:
  1450. r1 = ""
  1451. if len(r2) >= len(suffix):
  1452. r2 = suffix_replace(r2, suffix, "ic")
  1453. else:
  1454. r2 = ""
  1455. elif suffix in ("ful", "ness"):
  1456. word = word[: -len(suffix)]
  1457. r1 = r1[: -len(suffix)]
  1458. r2 = r2[: -len(suffix)]
  1459. elif suffix == "ative" and r2.endswith(suffix):
  1460. word = word[:-5]
  1461. r1 = r1[:-5]
  1462. r2 = r2[:-5]
  1463. break
  1464. # STEP 4
  1465. for suffix in self.__step4_suffixes:
  1466. if word.endswith(suffix):
  1467. if r2.endswith(suffix):
  1468. if suffix == "ion":
  1469. if word[-4] in "st":
  1470. word = word[:-3]
  1471. r1 = r1[:-3]
  1472. r2 = r2[:-3]
  1473. else:
  1474. word = word[: -len(suffix)]
  1475. r1 = r1[: -len(suffix)]
  1476. r2 = r2[: -len(suffix)]
  1477. break
  1478. # STEP 5
  1479. if r2.endswith("l") and word[-2] == "l":
  1480. word = word[:-1]
  1481. elif r2.endswith("e"):
  1482. word = word[:-1]
  1483. elif r1.endswith("e"):
  1484. if len(word) >= 4 and (
  1485. word[-2] in self.__vowels
  1486. or word[-2] in "wxY"
  1487. or word[-3] not in self.__vowels
  1488. or word[-4] in self.__vowels
  1489. ):
  1490. word = word[:-1]
  1491. word = word.replace("Y", "y")
  1492. return word
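# Added illustration for EnglishStemmer (assumes the module is importable as
# nltk.stem.snowball). "skies" and "dying" are taken directly from the
# __special_words table above; the other two follow steps 1b/1c/2.
#
#     from nltk.stem.snowball import EnglishStemmer
#     stemmer = EnglishStemmer()
#     stemmer.stem("skies")       # 'sky'      (special word)
#     stemmer.stem("dying")       # 'die'      (special word)
#     stemmer.stem("running")     # 'run'      ('ing' removed, 'nn' undoubled)
#     stemmer.stem("generously")  # 'generous' ('y' -> 'i', then 'ousli' -> 'ous')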
  1493. class FinnishStemmer(_StandardStemmer):
  1494. """
  1495. The Finnish Snowball stemmer.
  1496. :cvar __vowels: The Finnish vowels.
  1497. :type __vowels: unicode
  1498. :cvar __restricted_vowels: A subset of the Finnish vowels.
  1499. :type __restricted_vowels: unicode
  1500. :cvar __long_vowels: The Finnish vowels in their long forms.
  1501. :type __long_vowels: tuple
  1502. :cvar __consonants: The Finnish consonants.
  1503. :type __consonants: unicode
  1504. :cvar __double_consonants: The Finnish double consonants.
  1505. :type __double_consonants: tuple
  1506. :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm.
  1507. :type __step1_suffixes: tuple
  1508. :cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm.
  1509. :type __step2_suffixes: tuple
  1510. :cvar __step3_suffixes: Suffixes to be deleted in step 3 of the algorithm.
  1511. :type __step3_suffixes: tuple
  1512. :cvar __step4_suffixes: Suffixes to be deleted in step 4 of the algorithm.
  1513. :type __step4_suffixes: tuple
  1514. :note: A detailed description of the Finnish
  1515. stemming algorithm can be found under
  1516. http://snowball.tartarus.org/algorithms/finnish/stemmer.html
  1517. """
  1518. __vowels = "aeiouy\xE4\xF6"
  1519. __restricted_vowels = "aeiou\xE4\xF6"
  1520. __long_vowels = ("aa", "ee", "ii", "oo", "uu", "\xE4\xE4", "\xF6\xF6")
  1521. __consonants = "bcdfghjklmnpqrstvwxz"
  1522. __double_consonants = (
  1523. "bb",
  1524. "cc",
  1525. "dd",
  1526. "ff",
  1527. "gg",
  1528. "hh",
  1529. "jj",
  1530. "kk",
  1531. "ll",
  1532. "mm",
  1533. "nn",
  1534. "pp",
  1535. "qq",
  1536. "rr",
  1537. "ss",
  1538. "tt",
  1539. "vv",
  1540. "ww",
  1541. "xx",
  1542. "zz",
  1543. )
  1544. __step1_suffixes = (
  1545. "kaan",
  1546. "k\xE4\xE4n",
  1547. "sti",
  1548. "kin",
  1549. "han",
  1550. "h\xE4n",
  1551. "ko",
  1552. "k\xF6",
  1553. "pa",
  1554. "p\xE4",
  1555. )
  1556. __step2_suffixes = ("nsa", "ns\xE4", "mme", "nne", "si", "ni", "an", "\xE4n", "en")
  1557. __step3_suffixes = (
  1558. "siin",
  1559. "tten",
  1560. "seen",
  1561. "han",
  1562. "hen",
  1563. "hin",
  1564. "hon",
  1565. "h\xE4n",
  1566. "h\xF6n",
  1567. "den",
  1568. "tta",
  1569. "tt\xE4",
  1570. "ssa",
  1571. "ss\xE4",
  1572. "sta",
  1573. "st\xE4",
  1574. "lla",
  1575. "ll\xE4",
  1576. "lta",
  1577. "lt\xE4",
  1578. "lle",
  1579. "ksi",
  1580. "ine",
  1581. "ta",
  1582. "t\xE4",
  1583. "na",
  1584. "n\xE4",
  1585. "a",
  1586. "\xE4",
  1587. "n",
  1588. )
  1589. __step4_suffixes = (
  1590. "impi",
  1591. "impa",
  1592. "imp\xE4",
  1593. "immi",
  1594. "imma",
  1595. "imm\xE4",
  1596. "mpi",
  1597. "mpa",
  1598. "mp\xE4",
  1599. "mmi",
  1600. "mma",
  1601. "mm\xE4",
  1602. "eja",
  1603. "ej\xE4",
  1604. )
  1605. def stem(self, word):
  1606. """
  1607. Stem a Finnish word and return the stemmed form.
  1608. :param word: The word that is stemmed.
  1609. :type word: str or unicode
  1610. :return: The stemmed form.
  1611. :rtype: unicode
  1612. """
  1613. word = word.lower()
  1614. if word in self.stopwords:
  1615. return word
  1616. step3_success = False
  1617. r1, r2 = self._r1r2_standard(word, self.__vowels)
  1618. # STEP 1: Particles etc.
  1619. for suffix in self.__step1_suffixes:
  1620. if r1.endswith(suffix):
  1621. if suffix == "sti":
  1622. if suffix in r2:
  1623. word = word[:-3]
  1624. r1 = r1[:-3]
  1625. r2 = r2[:-3]
  1626. else:
  1627. if word[-len(suffix) - 1] in "ntaeiouy\xE4\xF6":
  1628. word = word[: -len(suffix)]
  1629. r1 = r1[: -len(suffix)]
  1630. r2 = r2[: -len(suffix)]
  1631. break
  1632. # STEP 2: Possessives
  1633. for suffix in self.__step2_suffixes:
  1634. if r1.endswith(suffix):
  1635. if suffix == "si":
  1636. if word[-3] != "k":
  1637. word = word[:-2]
  1638. r1 = r1[:-2]
  1639. r2 = r2[:-2]
  1640. elif suffix == "ni":
  1641. word = word[:-2]
  1642. r1 = r1[:-2]
  1643. r2 = r2[:-2]
  1644. if word.endswith("kse"):
  1645. word = suffix_replace(word, "kse", "ksi")
  1646. if r1.endswith("kse"):
  1647. r1 = suffix_replace(r1, "kse", "ksi")
  1648. if r2.endswith("kse"):
  1649. r2 = suffix_replace(r2, "kse", "ksi")
  1650. elif suffix == "an":
  1651. if word[-4:-2] in ("ta", "na") or word[-5:-2] in (
  1652. "ssa",
  1653. "sta",
  1654. "lla",
  1655. "lta",
  1656. ):
  1657. word = word[:-2]
  1658. r1 = r1[:-2]
  1659. r2 = r2[:-2]
  1660. elif suffix == "\xE4n":
  1661. if word[-4:-2] in ("t\xE4", "n\xE4") or word[-5:-2] in (
  1662. "ss\xE4",
  1663. "st\xE4",
  1664. "ll\xE4",
  1665. "lt\xE4",
  1666. ):
  1667. word = word[:-2]
  1668. r1 = r1[:-2]
  1669. r2 = r2[:-2]
  1670. elif suffix == "en":
  1671. if word[-5:-2] in ("lle", "ine"):
  1672. word = word[:-2]
  1673. r1 = r1[:-2]
  1674. r2 = r2[:-2]
  1675. else:
  1676. word = word[:-3]
  1677. r1 = r1[:-3]
  1678. r2 = r2[:-3]
  1679. break
  1680. # STEP 3: Cases
  1681. for suffix in self.__step3_suffixes:
  1682. if r1.endswith(suffix):
  1683. if suffix in ("han", "hen", "hin", "hon", "h\xE4n", "h\xF6n"):
  1684. if (
  1685. (suffix == "han" and word[-4] == "a")
  1686. or (suffix == "hen" and word[-4] == "e")
  1687. or (suffix == "hin" and word[-4] == "i")
  1688. or (suffix == "hon" and word[-4] == "o")
  1689. or (suffix == "h\xE4n" and word[-4] == "\xE4")
  1690. or (suffix == "h\xF6n" and word[-4] == "\xF6")
  1691. ):
  1692. word = word[:-3]
  1693. r1 = r1[:-3]
  1694. r2 = r2[:-3]
  1695. step3_success = True
  1696. elif suffix in ("siin", "den", "tten"):
  1697. if (
  1698. word[-len(suffix) - 1] == "i"
  1699. and word[-len(suffix) - 2] in self.__restricted_vowels
  1700. ):
  1701. word = word[: -len(suffix)]
  1702. r1 = r1[: -len(suffix)]
  1703. r2 = r2[: -len(suffix)]
  1704. step3_success = True
  1705. else:
  1706. continue
  1707. elif suffix == "seen":
  1708. if word[-6:-4] in self.__long_vowels:
  1709. word = word[:-4]
  1710. r1 = r1[:-4]
  1711. r2 = r2[:-4]
  1712. step3_success = True
  1713. else:
  1714. continue
  1715. elif suffix in ("a", "\xE4"):
  1716. if word[-2] in self.__vowels and word[-3] in self.__consonants:
  1717. word = word[:-1]
  1718. r1 = r1[:-1]
  1719. r2 = r2[:-1]
  1720. step3_success = True
  1721. elif suffix in ("tta", "tt\xE4"):
  1722. if word[-4] == "e":
  1723. word = word[:-3]
  1724. r1 = r1[:-3]
  1725. r2 = r2[:-3]
  1726. step3_success = True
  1727. elif suffix == "n":
  1728. word = word[:-1]
  1729. r1 = r1[:-1]
  1730. r2 = r2[:-1]
  1731. step3_success = True
  1732. if word[-2:] == "ie" or word[-2:] in self.__long_vowels:
  1733. word = word[:-1]
  1734. r1 = r1[:-1]
  1735. r2 = r2[:-1]
  1736. else:
  1737. word = word[: -len(suffix)]
  1738. r1 = r1[: -len(suffix)]
  1739. r2 = r2[: -len(suffix)]
  1740. step3_success = True
  1741. break
  1742. # STEP 4: Other endings
  1743. for suffix in self.__step4_suffixes:
  1744. if r2.endswith(suffix):
  1745. if suffix in ("mpi", "mpa", "mp\xE4", "mmi", "mma", "mm\xE4"):
  1746. if word[-5:-3] != "po":
  1747. word = word[:-3]
  1748. r1 = r1[:-3]
  1749. r2 = r2[:-3]
  1750. else:
  1751. word = word[: -len(suffix)]
  1752. r1 = r1[: -len(suffix)]
  1753. r2 = r2[: -len(suffix)]
  1754. break
  1755. # STEP 5: Plurals
  1756. if step3_success and len(r1) >= 1 and r1[-1] in "ij":
  1757. word = word[:-1]
  1758. r1 = r1[:-1]
  1759. elif (
  1760. not step3_success
  1761. and len(r1) >= 2
  1762. and r1[-1] == "t"
  1763. and r1[-2] in self.__vowels
  1764. ):
  1765. word = word[:-1]
  1766. r1 = r1[:-1]
  1767. r2 = r2[:-1]
  1768. if r2.endswith("imma"):
  1769. word = word[:-4]
  1770. r1 = r1[:-4]
  1771. elif r2.endswith("mma") and r2[-5:-3] != "po":
  1772. word = word[:-3]
  1773. r1 = r1[:-3]
  1774. # STEP 6: Tidying up
  1775. if r1[-2:] in self.__long_vowels:
  1776. word = word[:-1]
  1777. r1 = r1[:-1]
  1778. if len(r1) >= 2 and r1[-2] in self.__consonants and r1[-1] in "a\xE4ei":
  1779. word = word[:-1]
  1780. r1 = r1[:-1]
  1781. if r1.endswith(("oj", "uj")):
  1782. word = word[:-1]
  1783. r1 = r1[:-1]
  1784. if r1.endswith("jo"):
  1785. word = word[:-1]
  1786. r1 = r1[:-1]
  1787. # If the word ends with a double consonant
  1788. # followed by zero or more vowels, the last consonant is removed.
  1789. for i in range(1, len(word)):
  1790. if word[-i] in self.__vowels:
  1791. continue
  1792. else:
  1793. if i == 1:
  1794. if word[-i - 1 :] in self.__double_consonants:
  1795. word = word[:-1]
  1796. else:
  1797. if word[-i - 1 : -i + 1] in self.__double_consonants:
  1798. word = "".join((word[:-i], word[-i + 1 :]))
  1799. break
  1800. return word
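# Added note: like every stemmer in this module, FinnishStemmer returns a word
# unchanged when it is found in self.stopwords; that set is only populated via
# the base-class constructor (the flag name below is assumed from that class).
#
#     stemmer = FinnishStemmer(ignore_stopwords=True)
#     stemmer.stem("olla")   # returned as-is if "olla" is in the loaded stopword list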
  1801. class FrenchStemmer(_StandardStemmer):
  1802. """
  1803. The French Snowball stemmer.
  1804. :cvar __vowels: The French vowels.
  1805. :type __vowels: unicode
  1806. :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm.
  1807. :type __step1_suffixes: tuple
  1808. :cvar __step2a_suffixes: Suffixes to be deleted in step 2a of the algorithm.
  1809. :type __step2a_suffixes: tuple
  1810. :cvar __step2b_suffixes: Suffixes to be deleted in step 2b of the algorithm.
  1811. :type __step2b_suffixes: tuple
  1812. :cvar __step4_suffixes: Suffixes to be deleted in step 4 of the algorithm.
  1813. :type __step4_suffixes: tuple
  1814. :note: A detailed description of the French
  1815. stemming algorithm can be found under
  1816. http://snowball.tartarus.org/algorithms/french/stemmer.html
  1817. """
  1818. __vowels = "aeiouy\xE2\xE0\xEB\xE9\xEA\xE8\xEF\xEE\xF4\xFB\xF9"
  1819. __step1_suffixes = (
  1820. "issements",
  1821. "issement",
  1822. "atrices",
  1823. "atrice",
  1824. "ateurs",
  1825. "ations",
  1826. "logies",
  1827. "usions",
  1828. "utions",
  1829. "ements",
  1830. "amment",
  1831. "emment",
  1832. "ances",
  1833. "iqUes",
  1834. "ismes",
  1835. "ables",
  1836. "istes",
  1837. "ateur",
  1838. "ation",
  1839. "logie",
  1840. "usion",
  1841. "ution",
  1842. "ences",
  1843. "ement",
  1844. "euses",
  1845. "ments",
  1846. "ance",
  1847. "iqUe",
  1848. "isme",
  1849. "able",
  1850. "iste",
  1851. "ence",
  1852. "it\xE9s",
  1853. "ives",
  1854. "eaux",
  1855. "euse",
  1856. "ment",
  1857. "eux",
  1858. "it\xE9",
  1859. "ive",
  1860. "ifs",
  1861. "aux",
  1862. "if",
  1863. )
  1864. __step2a_suffixes = (
  1865. "issaIent",
  1866. "issantes",
  1867. "iraIent",
  1868. "issante",
  1869. "issants",
  1870. "issions",
  1871. "irions",
  1872. "issais",
  1873. "issait",
  1874. "issant",
  1875. "issent",
  1876. "issiez",
  1877. "issons",
  1878. "irais",
  1879. "irait",
  1880. "irent",
  1881. "iriez",
  1882. "irons",
  1883. "iront",
  1884. "isses",
  1885. "issez",
  1886. "\xEEmes",
  1887. "\xEEtes",
  1888. "irai",
  1889. "iras",
  1890. "irez",
  1891. "isse",
  1892. "ies",
  1893. "ira",
  1894. "\xEEt",
  1895. "ie",
  1896. "ir",
  1897. "is",
  1898. "it",
  1899. "i",
  1900. )
  1901. __step2b_suffixes = (
  1902. "eraIent",
  1903. "assions",
  1904. "erions",
  1905. "assent",
  1906. "assiez",
  1907. "\xE8rent",
  1908. "erais",
  1909. "erait",
  1910. "eriez",
  1911. "erons",
  1912. "eront",
  1913. "aIent",
  1914. "antes",
  1915. "asses",
  1916. "ions",
  1917. "erai",
  1918. "eras",
  1919. "erez",
  1920. "\xE2mes",
  1921. "\xE2tes",
  1922. "ante",
  1923. "ants",
  1924. "asse",
  1925. "\xE9es",
  1926. "era",
  1927. "iez",
  1928. "ais",
  1929. "ait",
  1930. "ant",
  1931. "\xE9e",
  1932. "\xE9s",
  1933. "er",
  1934. "ez",
  1935. "\xE2t",
  1936. "ai",
  1937. "as",
  1938. "\xE9",
  1939. "a",
  1940. )
  1941. __step4_suffixes = ("i\xE8re", "I\xE8re", "ion", "ier", "Ier", "e", "\xEB")
  1942. def stem(self, word):
  1943. """
  1944. Stem a French word and return the stemmed form.
  1945. :param word: The word that is stemmed.
  1946. :type word: str or unicode
  1947. :return: The stemmed form.
  1948. :rtype: unicode
  1949. """
  1950. word = word.lower()
  1951. if word in self.stopwords:
  1952. return word
  1953. step1_success = False
  1954. rv_ending_found = False
  1955. step2a_success = False
  1956. step2b_success = False
  1957. # Every occurrence of 'u' after 'q' is put into upper case.
  1958. for i in range(1, len(word)):
  1959. if word[i - 1] == "q" and word[i] == "u":
  1960. word = "".join((word[:i], "U", word[i + 1 :]))
  1961. # Every occurrence of 'u' and 'i'
  1962. # between vowels is put into upper case.
  1963. # Every occurrence of 'y' preceded or
  1964. # followed by a vowel is also put into upper case.
  1965. for i in range(1, len(word) - 1):
  1966. if word[i - 1] in self.__vowels and word[i + 1] in self.__vowels:
  1967. if word[i] == "u":
  1968. word = "".join((word[:i], "U", word[i + 1 :]))
  1969. elif word[i] == "i":
  1970. word = "".join((word[:i], "I", word[i + 1 :]))
  1971. if word[i - 1] in self.__vowels or word[i + 1] in self.__vowels:
  1972. if word[i] == "y":
  1973. word = "".join((word[:i], "Y", word[i + 1 :]))
  1974. r1, r2 = self._r1r2_standard(word, self.__vowels)
  1975. rv = self.__rv_french(word, self.__vowels)
  1976. # STEP 1: Standard suffix removal
  1977. for suffix in self.__step1_suffixes:
  1978. if word.endswith(suffix):
  1979. if suffix == "eaux":
  1980. word = word[:-1]
  1981. step1_success = True
  1982. elif suffix in ("euse", "euses"):
  1983. if suffix in r2:
  1984. word = word[: -len(suffix)]
  1985. step1_success = True
  1986. elif suffix in r1:
  1987. word = suffix_replace(word, suffix, "eux")
  1988. step1_success = True
  1989. elif suffix in ("ement", "ements") and suffix in rv:
  1990. word = word[: -len(suffix)]
  1991. step1_success = True
  1992. if word[-2:] == "iv" and "iv" in r2:
  1993. word = word[:-2]
  1994. if word[-2:] == "at" and "at" in r2:
  1995. word = word[:-2]
  1996. elif word[-3:] == "eus":
  1997. if "eus" in r2:
  1998. word = word[:-3]
  1999. elif "eus" in r1:
  2000. word = "".join((word[:-1], "x"))
  2001. elif word[-3:] in ("abl", "iqU"):
  2002. if "abl" in r2 or "iqU" in r2:
  2003. word = word[:-3]
  2004. elif word[-3:] in ("i\xE8r", "I\xE8r"):
  2005. if "i\xE8r" in rv or "I\xE8r" in rv:
  2006. word = "".join((word[:-3], "i"))
  2007. elif suffix == "amment" and suffix in rv:
  2008. word = suffix_replace(word, "amment", "ant")
  2009. rv = suffix_replace(rv, "amment", "ant")
  2010. rv_ending_found = True
  2011. elif suffix == "emment" and suffix in rv:
  2012. word = suffix_replace(word, "emment", "ent")
  2013. rv_ending_found = True
  2014. elif (
  2015. suffix in ("ment", "ments")
  2016. and suffix in rv
  2017. and not rv.startswith(suffix)
  2018. and rv[rv.rindex(suffix) - 1] in self.__vowels
  2019. ):
  2020. word = word[: -len(suffix)]
  2021. rv = rv[: -len(suffix)]
  2022. rv_ending_found = True
  2023. elif suffix == "aux" and suffix in r1:
  2024. word = "".join((word[:-2], "l"))
  2025. step1_success = True
  2026. elif (
  2027. suffix in ("issement", "issements")
  2028. and suffix in r1
  2029. and word[-len(suffix) - 1] not in self.__vowels
  2030. ):
  2031. word = word[: -len(suffix)]
  2032. step1_success = True
  2033. elif (
  2034. suffix
  2035. in (
  2036. "ance",
  2037. "iqUe",
  2038. "isme",
  2039. "able",
  2040. "iste",
  2041. "eux",
  2042. "ances",
  2043. "iqUes",
  2044. "ismes",
  2045. "ables",
  2046. "istes",
  2047. )
  2048. and suffix in r2
  2049. ):
  2050. word = word[: -len(suffix)]
  2051. step1_success = True
  2052. elif (
  2053. suffix
  2054. in ("atrice", "ateur", "ation", "atrices", "ateurs", "ations")
  2055. and suffix in r2
  2056. ):
  2057. word = word[: -len(suffix)]
  2058. step1_success = True
  2059. if word[-2:] == "ic":
  2060. if "ic" in r2:
  2061. word = word[:-2]
  2062. else:
  2063. word = "".join((word[:-2], "iqU"))
  2064. elif suffix in ("logie", "logies") and suffix in r2:
  2065. word = suffix_replace(word, suffix, "log")
  2066. step1_success = True
  2067. elif suffix in ("usion", "ution", "usions", "utions") and suffix in r2:
  2068. word = suffix_replace(word, suffix, "u")
  2069. step1_success = True
  2070. elif suffix in ("ence", "ences") and suffix in r2:
  2071. word = suffix_replace(word, suffix, "ent")
  2072. step1_success = True
  2073. elif suffix in ("it\xE9", "it\xE9s") and suffix in r2:
  2074. word = word[: -len(suffix)]
  2075. step1_success = True
  2076. if word[-4:] == "abil":
  2077. if "abil" in r2:
  2078. word = word[:-4]
  2079. else:
  2080. word = "".join((word[:-2], "l"))
  2081. elif word[-2:] == "ic":
  2082. if "ic" in r2:
  2083. word = word[:-2]
  2084. else:
  2085. word = "".join((word[:-2], "iqU"))
  2086. elif word[-2:] == "iv":
  2087. if "iv" in r2:
  2088. word = word[:-2]
  2089. elif suffix in ("if", "ive", "ifs", "ives") and suffix in r2:
  2090. word = word[: -len(suffix)]
  2091. step1_success = True
  2092. if word[-2:] == "at" and "at" in r2:
  2093. word = word[:-2]
  2094. if word[-2:] == "ic":
  2095. if "ic" in r2:
  2096. word = word[:-2]
  2097. else:
  2098. word = "".join((word[:-2], "iqU"))
  2099. break
  2100. # STEP 2a: Verb suffixes beginning 'i'
  2101. if not step1_success or rv_ending_found:
  2102. for suffix in self.__step2a_suffixes:
  2103. if word.endswith(suffix):
  2104. if (
  2105. suffix in rv
  2106. and len(rv) > len(suffix)
  2107. and rv[rv.rindex(suffix) - 1] not in self.__vowels
  2108. ):
  2109. word = word[: -len(suffix)]
  2110. step2a_success = True
  2111. break
  2112. # STEP 2b: Other verb suffixes
  2113. if not step2a_success:
  2114. for suffix in self.__step2b_suffixes:
  2115. if rv.endswith(suffix):
  2116. if suffix == "ions" and "ions" in r2:
  2117. word = word[:-4]
  2118. step2b_success = True
  2119. elif suffix in (
  2120. "eraIent",
  2121. "erions",
  2122. "\xE8rent",
  2123. "erais",
  2124. "erait",
  2125. "eriez",
  2126. "erons",
  2127. "eront",
  2128. "erai",
  2129. "eras",
  2130. "erez",
  2131. "\xE9es",
  2132. "era",
  2133. "iez",
  2134. "\xE9e",
  2135. "\xE9s",
  2136. "er",
  2137. "ez",
  2138. "\xE9",
  2139. ):
  2140. word = word[: -len(suffix)]
  2141. step2b_success = True
  2142. elif suffix in (
  2143. "assions",
  2144. "assent",
  2145. "assiez",
  2146. "aIent",
  2147. "antes",
  2148. "asses",
  2149. "\xE2mes",
  2150. "\xE2tes",
  2151. "ante",
  2152. "ants",
  2153. "asse",
  2154. "ais",
  2155. "ait",
  2156. "ant",
  2157. "\xE2t",
  2158. "ai",
  2159. "as",
  2160. "a",
  2161. ):
  2162. word = word[: -len(suffix)]
  2163. rv = rv[: -len(suffix)]
  2164. step2b_success = True
  2165. if rv.endswith("e"):
  2166. word = word[:-1]
  2167. break
  2168. # STEP 3
  2169. if step1_success or step2a_success or step2b_success:
  2170. if word[-1] == "Y":
  2171. word = "".join((word[:-1], "i"))
  2172. elif word[-1] == "\xE7":
  2173. word = "".join((word[:-1], "c"))
  2174. # STEP 4: Residual suffixes
  2175. else:
  2176. if len(word) >= 2 and word[-1] == "s" and word[-2] not in "aiou\xE8s":
  2177. word = word[:-1]
  2178. for suffix in self.__step4_suffixes:
  2179. if word.endswith(suffix):
  2180. if suffix in rv:
  2181. if suffix == "ion" and suffix in r2 and rv[-4] in "st":
  2182. word = word[:-3]
  2183. elif suffix in ("ier", "i\xE8re", "Ier", "I\xE8re"):
  2184. word = suffix_replace(word, suffix, "i")
  2185. elif suffix == "e":
  2186. word = word[:-1]
  2187. elif suffix == "\xEB" and word[-3:-1] == "gu":
  2188. word = word[:-1]
  2189. break
  2190. # STEP 5: Undouble
  2191. if word.endswith(("enn", "onn", "ett", "ell", "eill")):
  2192. word = word[:-1]
  2193. # STEP 6: Un-accent
  2194. for i in range(1, len(word)):
  2195. if word[-i] not in self.__vowels:
  2196. i += 1
  2197. else:
  2198. if i != 1 and word[-i] in ("\xE9", "\xE8"):
  2199. word = "".join((word[:-i], "e", word[-i + 1 :]))
  2200. break
  2201. word = word.replace("I", "i").replace("U", "u").replace("Y", "y")
  2202. return word
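# Added illustration of the consonant-marking done at the top of stem() above:
# the temporary upper-case letters flag 'u', 'i' and 'y' that must be treated
# as consonants, and they are lowered again just before stem() returns.
#
#     "qui"   -> "qUi"    ('u' after 'q' is marked)
#     "payer" -> "paYer"  ('y' next to a vowel is marked)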
  2203. def __rv_french(self, word, vowels):
  2204. """
  2205. Return the region RV that is used by the French stemmer.
  2206. If the word begins with two vowels, RV is the region after
  2207. the third letter. Otherwise, it is the region after the first
  2208. vowel not at the beginning of the word, or the end of the word
2209. if these positions cannot be found. (Exceptionally, 'par',
2210. 'col' or 'tap' at the beginning of a word also defines RV as
2211. the region to the right of that prefix.)
  2212. :param word: The French word whose region RV is determined.
  2213. :type word: str or unicode
  2214. :param vowels: The French vowels that are used to determine
  2215. the region RV.
  2216. :type vowels: unicode
  2217. :return: the region RV for the respective French word.
  2218. :rtype: unicode
  2219. :note: This helper method is invoked by the stem method of
  2220. the subclass FrenchStemmer. It is not to be invoked directly!
  2221. """
  2222. rv = ""
  2223. if len(word) >= 2:
  2224. if word.startswith(("par", "col", "tap")) or (
  2225. word[0] in vowels and word[1] in vowels
  2226. ):
  2227. rv = word[3:]
  2228. else:
  2229. for i in range(1, len(word)):
  2230. if word[i] in vowels:
  2231. rv = word[i + 1 :]
  2232. break
  2233. return rv
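# Added illustration: RV as computed by __rv_french for a few inputs, traced
# from the rules implemented above.
#
#     "parole" -> RV = "ole"   (starts with "par", so RV = word[3:])
#     "aimer"  -> RV = "er"    (two initial vowels, so RV = word[3:])
#     "dormir" -> RV = "rmir"  (first vowel past the start is 'o'; RV follows it)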
  2234. class GermanStemmer(_StandardStemmer):
  2235. """
  2236. The German Snowball stemmer.
  2237. :cvar __vowels: The German vowels.
  2238. :type __vowels: unicode
  2239. :cvar __s_ending: Letters that may directly appear before a word final 's'.
  2240. :type __s_ending: unicode
2241. :cvar __st_ending: Letters that may directly appear before a word final 'st'.
  2242. :type __st_ending: unicode
  2243. :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm.
  2244. :type __step1_suffixes: tuple
  2245. :cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm.
  2246. :type __step2_suffixes: tuple
  2247. :cvar __step3_suffixes: Suffixes to be deleted in step 3 of the algorithm.
  2248. :type __step3_suffixes: tuple
  2249. :note: A detailed description of the German
  2250. stemming algorithm can be found under
  2251. http://snowball.tartarus.org/algorithms/german/stemmer.html
  2252. """
  2253. __vowels = "aeiouy\xE4\xF6\xFC"
  2254. __s_ending = "bdfghklmnrt"
  2255. __st_ending = "bdfghklmnt"
  2256. __step1_suffixes = ("ern", "em", "er", "en", "es", "e", "s")
  2257. __step2_suffixes = ("est", "en", "er", "st")
  2258. __step3_suffixes = ("isch", "lich", "heit", "keit", "end", "ung", "ig", "ik")
  2259. def stem(self, word):
  2260. """
  2261. Stem a German word and return the stemmed form.
  2262. :param word: The word that is stemmed.
  2263. :type word: str or unicode
  2264. :return: The stemmed form.
  2265. :rtype: unicode
  2266. """
  2267. word = word.lower()
  2268. if word in self.stopwords:
  2269. return word
  2270. word = word.replace("\xDF", "ss")
  2271. # Every occurrence of 'u' and 'y'
  2272. # between vowels is put into upper case.
  2273. for i in range(1, len(word) - 1):
  2274. if word[i - 1] in self.__vowels and word[i + 1] in self.__vowels:
  2275. if word[i] == "u":
  2276. word = "".join((word[:i], "U", word[i + 1 :]))
  2277. elif word[i] == "y":
  2278. word = "".join((word[:i], "Y", word[i + 1 :]))
  2279. r1, r2 = self._r1r2_standard(word, self.__vowels)
  2280. # R1 is adjusted so that the region before it
  2281. # contains at least 3 letters.
  2282. for i in range(1, len(word)):
  2283. if word[i] not in self.__vowels and word[i - 1] in self.__vowels:
  2284. if 3 > len(word[: i + 1]) > 0:
  2285. r1 = word[3:]
  2286. elif len(word[: i + 1]) == 0:
  2287. return word
  2288. break
  2289. # STEP 1
  2290. for suffix in self.__step1_suffixes:
  2291. if r1.endswith(suffix):
  2292. if (
  2293. suffix in ("en", "es", "e")
  2294. and word[-len(suffix) - 4 : -len(suffix)] == "niss"
  2295. ):
  2296. word = word[: -len(suffix) - 1]
  2297. r1 = r1[: -len(suffix) - 1]
  2298. r2 = r2[: -len(suffix) - 1]
  2299. elif suffix == "s":
  2300. if word[-2] in self.__s_ending:
  2301. word = word[:-1]
  2302. r1 = r1[:-1]
  2303. r2 = r2[:-1]
  2304. else:
  2305. word = word[: -len(suffix)]
  2306. r1 = r1[: -len(suffix)]
  2307. r2 = r2[: -len(suffix)]
  2308. break
  2309. # STEP 2
  2310. for suffix in self.__step2_suffixes:
  2311. if r1.endswith(suffix):
  2312. if suffix == "st":
  2313. if word[-3] in self.__st_ending and len(word[:-3]) >= 3:
  2314. word = word[:-2]
  2315. r1 = r1[:-2]
  2316. r2 = r2[:-2]
  2317. else:
  2318. word = word[: -len(suffix)]
  2319. r1 = r1[: -len(suffix)]
  2320. r2 = r2[: -len(suffix)]
  2321. break
  2322. # STEP 3: Derivational suffixes
  2323. for suffix in self.__step3_suffixes:
  2324. if r2.endswith(suffix):
  2325. if suffix in ("end", "ung"):
  2326. if (
  2327. "ig" in r2[-len(suffix) - 2 : -len(suffix)]
  2328. and "e" not in r2[-len(suffix) - 3 : -len(suffix) - 2]
  2329. ):
  2330. word = word[: -len(suffix) - 2]
  2331. else:
  2332. word = word[: -len(suffix)]
  2333. elif (
  2334. suffix in ("ig", "ik", "isch")
  2335. and "e" not in r2[-len(suffix) - 1 : -len(suffix)]
  2336. ):
  2337. word = word[: -len(suffix)]
  2338. elif suffix in ("lich", "heit"):
  2339. if (
  2340. "er" in r1[-len(suffix) - 2 : -len(suffix)]
  2341. or "en" in r1[-len(suffix) - 2 : -len(suffix)]
  2342. ):
  2343. word = word[: -len(suffix) - 2]
  2344. else:
  2345. word = word[: -len(suffix)]
  2346. elif suffix == "keit":
  2347. if "lich" in r2[-len(suffix) - 4 : -len(suffix)]:
  2348. word = word[: -len(suffix) - 4]
  2349. elif "ig" in r2[-len(suffix) - 2 : -len(suffix)]:
  2350. word = word[: -len(suffix) - 2]
  2351. else:
  2352. word = word[: -len(suffix)]
  2353. break
  2354. # Umlaut accents are removed and
  2355. # 'u' and 'y' are put back into lower case.
  2356. word = (
  2357. word.replace("\xE4", "a")
  2358. .replace("\xF6", "o")
  2359. .replace("\xFC", "u")
  2360. .replace("U", "u")
  2361. .replace("Y", "y")
  2362. )
  2363. return word
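# Added note on GermanStemmer: '\xDF' ("ß") is rewritten to "ss" before the
# regions are computed, and step 2 strips a final "st" only when the letter
# before it is in __st_ending and at least three letters precede that letter.
# Tracing the code above, "feinst" -> "fein". The import path is assumed.
#
#     from nltk.stem.snowball import GermanStemmer
#     GermanStemmer().stem("feinst")   # -> 'fein'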
  2364. class HungarianStemmer(_LanguageSpecificStemmer):
  2365. """
  2366. The Hungarian Snowball stemmer.
  2367. :cvar __vowels: The Hungarian vowels.
  2368. :type __vowels: unicode
  2369. :cvar __digraphs: The Hungarian digraphs.
  2370. :type __digraphs: tuple
  2371. :cvar __double_consonants: The Hungarian double consonants.
  2372. :type __double_consonants: tuple
  2373. :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm.
  2374. :type __step1_suffixes: tuple
  2375. :cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm.
  2376. :type __step2_suffixes: tuple
  2377. :cvar __step3_suffixes: Suffixes to be deleted in step 3 of the algorithm.
  2378. :type __step3_suffixes: tuple
  2379. :cvar __step4_suffixes: Suffixes to be deleted in step 4 of the algorithm.
  2380. :type __step4_suffixes: tuple
  2381. :cvar __step5_suffixes: Suffixes to be deleted in step 5 of the algorithm.
  2382. :type __step5_suffixes: tuple
  2383. :cvar __step6_suffixes: Suffixes to be deleted in step 6 of the algorithm.
  2384. :type __step6_suffixes: tuple
  2385. :cvar __step7_suffixes: Suffixes to be deleted in step 7 of the algorithm.
  2386. :type __step7_suffixes: tuple
  2387. :cvar __step8_suffixes: Suffixes to be deleted in step 8 of the algorithm.
  2388. :type __step8_suffixes: tuple
  2389. :cvar __step9_suffixes: Suffixes to be deleted in step 9 of the algorithm.
  2390. :type __step9_suffixes: tuple
  2391. :note: A detailed description of the Hungarian
  2392. stemming algorithm can be found under
  2393. http://snowball.tartarus.org/algorithms/hungarian/stemmer.html
  2394. """
  2395. __vowels = "aeiou\xF6\xFC\xE1\xE9\xED\xF3\xF5\xFA\xFB"
  2396. __digraphs = ("cs", "dz", "dzs", "gy", "ly", "ny", "ty", "zs")
  2397. __double_consonants = (
  2398. "bb",
  2399. "cc",
  2400. "ccs",
  2401. "dd",
  2402. "ff",
  2403. "gg",
  2404. "ggy",
  2405. "jj",
  2406. "kk",
  2407. "ll",
  2408. "lly",
  2409. "mm",
  2410. "nn",
  2411. "nny",
  2412. "pp",
  2413. "rr",
  2414. "ss",
  2415. "ssz",
  2416. "tt",
  2417. "tty",
  2418. "vv",
  2419. "zz",
  2420. "zzs",
  2421. )
  2422. __step1_suffixes = ("al", "el")
  2423. __step2_suffixes = (
  2424. "k\xE9ppen",
  2425. "onk\xE9nt",
  2426. "enk\xE9nt",
  2427. "ank\xE9nt",
  2428. "k\xE9pp",
  2429. "k\xE9nt",
  2430. "ban",
  2431. "ben",
  2432. "nak",
  2433. "nek",
  2434. "val",
  2435. "vel",
  2436. "t\xF3l",
  2437. "t\xF5l",
  2438. "r\xF3l",
  2439. "r\xF5l",
  2440. "b\xF3l",
  2441. "b\xF5l",
  2442. "hoz",
  2443. "hez",
  2444. "h\xF6z",
  2445. "n\xE1l",
  2446. "n\xE9l",
  2447. "\xE9rt",
  2448. "kor",
  2449. "ba",
  2450. "be",
  2451. "ra",
  2452. "re",
  2453. "ig",
  2454. "at",
  2455. "et",
  2456. "ot",
  2457. "\xF6t",
  2458. "ul",
  2459. "\xFCl",
  2460. "v\xE1",
  2461. "v\xE9",
  2462. "en",
  2463. "on",
  2464. "an",
  2465. "\xF6n",
  2466. "n",
  2467. "t",
  2468. )
  2469. __step3_suffixes = ("\xE1nk\xE9nt", "\xE1n", "\xE9n")
  2470. __step4_suffixes = (
  2471. "astul",
  2472. "est\xFCl",
  2473. "\xE1stul",
  2474. "\xE9st\xFCl",
  2475. "stul",
  2476. "st\xFCl",
  2477. )
  2478. __step5_suffixes = ("\xE1", "\xE9")
  2479. __step6_suffixes = (
  2480. "ok\xE9",
  2481. "\xF6k\xE9",
  2482. "ak\xE9",
  2483. "ek\xE9",
  2484. "\xE1k\xE9",
  2485. "\xE1\xE9i",
  2486. "\xE9k\xE9",
  2487. "\xE9\xE9i",
  2488. "k\xE9",
  2489. "\xE9i",
  2490. "\xE9\xE9",
  2491. "\xE9",
  2492. )
  2493. __step7_suffixes = (
  2494. "\xE1juk",
  2495. "\xE9j\xFCk",
  2496. "\xFCnk",
  2497. "unk",
  2498. "juk",
  2499. "j\xFCk",
  2500. "\xE1nk",
  2501. "\xE9nk",
  2502. "nk",
  2503. "uk",
  2504. "\xFCk",
  2505. "em",
  2506. "om",
  2507. "am",
  2508. "od",
  2509. "ed",
  2510. "ad",
  2511. "\xF6d",
  2512. "ja",
  2513. "je",
  2514. "\xE1m",
  2515. "\xE1d",
  2516. "\xE9m",
  2517. "\xE9d",
  2518. "m",
  2519. "d",
  2520. "a",
  2521. "e",
  2522. "o",
  2523. "\xE1",
  2524. "\xE9",
  2525. )
  2526. __step8_suffixes = (
  2527. "jaitok",
  2528. "jeitek",
  2529. "jaink",
  2530. "jeink",
  2531. "aitok",
  2532. "eitek",
  2533. "\xE1itok",
  2534. "\xE9itek",
  2535. "jaim",
  2536. "jeim",
  2537. "jaid",
  2538. "jeid",
  2539. "eink",
  2540. "aink",
  2541. "itek",
  2542. "jeik",
  2543. "jaik",
  2544. "\xE1ink",
  2545. "\xE9ink",
  2546. "aim",
  2547. "eim",
  2548. "aid",
  2549. "eid",
  2550. "jai",
  2551. "jei",
  2552. "ink",
  2553. "aik",
  2554. "eik",
  2555. "\xE1im",
  2556. "\xE1id",
  2557. "\xE1ik",
  2558. "\xE9im",
  2559. "\xE9id",
  2560. "\xE9ik",
  2561. "im",
  2562. "id",
  2563. "ai",
  2564. "ei",
  2565. "ik",
  2566. "\xE1i",
  2567. "\xE9i",
  2568. "i",
  2569. )
  2570. __step9_suffixes = ("\xE1k", "\xE9k", "\xF6k", "ok", "ek", "ak", "k")
  2571. def stem(self, word):
  2572. """
2573. Stem a Hungarian word and return the stemmed form.
  2574. :param word: The word that is stemmed.
  2575. :type word: str or unicode
  2576. :return: The stemmed form.
  2577. :rtype: unicode
  2578. """
  2579. word = word.lower()
  2580. if word in self.stopwords:
  2581. return word
  2582. r1 = self.__r1_hungarian(word, self.__vowels, self.__digraphs)
  2583. # STEP 1: Remove instrumental case
  2584. if r1.endswith(self.__step1_suffixes):
  2585. for double_cons in self.__double_consonants:
  2586. if word[-2 - len(double_cons) : -2] == double_cons:
  2587. word = "".join((word[:-4], word[-3]))
  2588. if r1[-2 - len(double_cons) : -2] == double_cons:
  2589. r1 = "".join((r1[:-4], r1[-3]))
  2590. break
  2591. # STEP 2: Remove frequent cases
  2592. for suffix in self.__step2_suffixes:
  2593. if word.endswith(suffix):
  2594. if r1.endswith(suffix):
  2595. word = word[: -len(suffix)]
  2596. r1 = r1[: -len(suffix)]
  2597. if r1.endswith("\xE1"):
  2598. word = "".join((word[:-1], "a"))
  2599. r1 = suffix_replace(r1, "\xE1", "a")
  2600. elif r1.endswith("\xE9"):
  2601. word = "".join((word[:-1], "e"))
  2602. r1 = suffix_replace(r1, "\xE9", "e")
  2603. break
  2604. # STEP 3: Remove special cases
  2605. for suffix in self.__step3_suffixes:
  2606. if r1.endswith(suffix):
  2607. if suffix == "\xE9n":
  2608. word = suffix_replace(word, suffix, "e")
  2609. r1 = suffix_replace(r1, suffix, "e")
  2610. else:
  2611. word = suffix_replace(word, suffix, "a")
  2612. r1 = suffix_replace(r1, suffix, "a")
  2613. break
  2614. # STEP 4: Remove other cases
  2615. for suffix in self.__step4_suffixes:
  2616. if r1.endswith(suffix):
  2617. if suffix == "\xE1stul":
  2618. word = suffix_replace(word, suffix, "a")
  2619. r1 = suffix_replace(r1, suffix, "a")
  2620. elif suffix == "\xE9st\xFCl":
  2621. word = suffix_replace(word, suffix, "e")
  2622. r1 = suffix_replace(r1, suffix, "e")
  2623. else:
  2624. word = word[: -len(suffix)]
  2625. r1 = r1[: -len(suffix)]
  2626. break
  2627. # STEP 5: Remove factive case
  2628. for suffix in self.__step5_suffixes:
  2629. if r1.endswith(suffix):
  2630. for double_cons in self.__double_consonants:
  2631. if word[-1 - len(double_cons) : -1] == double_cons:
  2632. word = "".join((word[:-3], word[-2]))
  2633. if r1[-1 - len(double_cons) : -1] == double_cons:
  2634. r1 = "".join((r1[:-3], r1[-2]))
  2635. break
  2636. # STEP 6: Remove owned
  2637. for suffix in self.__step6_suffixes:
  2638. if r1.endswith(suffix):
  2639. if suffix in ("\xE1k\xE9", "\xE1\xE9i"):
  2640. word = suffix_replace(word, suffix, "a")
  2641. r1 = suffix_replace(r1, suffix, "a")
  2642. elif suffix in ("\xE9k\xE9", "\xE9\xE9i", "\xE9\xE9"):
  2643. word = suffix_replace(word, suffix, "e")
  2644. r1 = suffix_replace(r1, suffix, "e")
  2645. else:
  2646. word = word[: -len(suffix)]
  2647. r1 = r1[: -len(suffix)]
  2648. break
  2649. # STEP 7: Remove singular owner suffixes
  2650. for suffix in self.__step7_suffixes:
  2651. if word.endswith(suffix):
  2652. if r1.endswith(suffix):
  2653. if suffix in ("\xE1nk", "\xE1juk", "\xE1m", "\xE1d", "\xE1"):
  2654. word = suffix_replace(word, suffix, "a")
  2655. r1 = suffix_replace(r1, suffix, "a")
  2656. elif suffix in ("\xE9nk", "\xE9j\xFCk", "\xE9m", "\xE9d", "\xE9"):
  2657. word = suffix_replace(word, suffix, "e")
  2658. r1 = suffix_replace(r1, suffix, "e")
  2659. else:
  2660. word = word[: -len(suffix)]
  2661. r1 = r1[: -len(suffix)]
  2662. break
  2663. # STEP 8: Remove plural owner suffixes
  2664. for suffix in self.__step8_suffixes:
  2665. if word.endswith(suffix):
  2666. if r1.endswith(suffix):
  2667. if suffix in (
  2668. "\xE1im",
  2669. "\xE1id",
  2670. "\xE1i",
  2671. "\xE1ink",
  2672. "\xE1itok",
  2673. "\xE1ik",
  2674. ):
  2675. word = suffix_replace(word, suffix, "a")
  2676. r1 = suffix_replace(r1, suffix, "a")
  2677. elif suffix in (
  2678. "\xE9im",
  2679. "\xE9id",
  2680. "\xE9i",
  2681. "\xE9ink",
  2682. "\xE9itek",
  2683. "\xE9ik",
  2684. ):
  2685. word = suffix_replace(word, suffix, "e")
  2686. r1 = suffix_replace(r1, suffix, "e")
  2687. else:
  2688. word = word[: -len(suffix)]
  2689. r1 = r1[: -len(suffix)]
  2690. break
  2691. # STEP 9: Remove plural suffixes
  2692. for suffix in self.__step9_suffixes:
  2693. if word.endswith(suffix):
  2694. if r1.endswith(suffix):
  2695. if suffix == "\xE1k":
  2696. word = suffix_replace(word, suffix, "a")
  2697. elif suffix == "\xE9k":
  2698. word = suffix_replace(word, suffix, "e")
  2699. else:
  2700. word = word[: -len(suffix)]
  2701. break
  2702. return word
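# Added illustration of step 1 above: for "baráttal" ("with a friend"), R1
# ends in "al" and the two letters before the suffix form the double
# consonant "tt", so the suffix and one of the doubled consonants are
# dropped: word[:-4] + word[-3] yields "barát".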
  2703. def __r1_hungarian(self, word, vowels, digraphs):
  2704. """
  2705. Return the region R1 that is used by the Hungarian stemmer.
  2706. If the word begins with a vowel, R1 is defined as the region
2707. after the first consonant or digraph (two letters standing for
2708. one phoneme) in the word. If the word begins with a consonant,
  2709. it is defined as the region after the first vowel in the word.
  2710. If the word does not contain both a vowel and consonant, R1
  2711. is the null region at the end of the word.
  2712. :param word: The Hungarian word whose region R1 is determined.
  2713. :type word: str or unicode
  2714. :param vowels: The Hungarian vowels that are used to determine
  2715. the region R1.
  2716. :type vowels: unicode
  2717. :param digraphs: The digraphs that are used to determine the
  2718. region R1.
  2719. :type digraphs: tuple
  2720. :return: the region R1 for the respective word.
  2721. :rtype: unicode
  2722. :note: This helper method is invoked by the stem method of the subclass
  2723. HungarianStemmer. It is not to be invoked directly!
  2724. """
  2725. r1 = ""
  2726. if word[0] in vowels:
  2727. for digraph in digraphs:
  2728. if digraph in word[1:]:
  2729. r1 = word[word.index(digraph[-1]) + 1 :]
  2730. return r1
  2731. for i in range(1, len(word)):
  2732. if word[i] not in vowels:
  2733. r1 = word[i + 1 :]
  2734. break
  2735. else:
  2736. for i in range(1, len(word)):
  2737. if word[i] in vowels:
  2738. r1 = word[i + 1 :]
  2739. break
  2740. return r1
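# Added illustration: R1 as computed by __r1_hungarian, traced from the rules
# implemented above.
#
#     "olvas"      -> R1 = "vas"      (vowel-initial, no digraph: the region
#                                      after the first consonant)
#     "cseresznye" -> R1 = "resznye"  (consonant-initial: the region after the
#                                      first vowel)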
  2741. class ItalianStemmer(_StandardStemmer):
  2742. """
  2743. The Italian Snowball stemmer.
  2744. :cvar __vowels: The Italian vowels.
  2745. :type __vowels: unicode
  2746. :cvar __step0_suffixes: Suffixes to be deleted in step 0 of the algorithm.
  2747. :type __step0_suffixes: tuple
  2748. :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm.
  2749. :type __step1_suffixes: tuple
  2750. :cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm.
  2751. :type __step2_suffixes: tuple
  2752. :note: A detailed description of the Italian
  2753. stemming algorithm can be found under
  2754. http://snowball.tartarus.org/algorithms/italian/stemmer.html
  2755. """
  2756. __vowels = "aeiou\xE0\xE8\xEC\xF2\xF9"
  2757. __step0_suffixes = (
  2758. "gliela",
  2759. "gliele",
  2760. "glieli",
  2761. "glielo",
  2762. "gliene",
  2763. "sene",
  2764. "mela",
  2765. "mele",
  2766. "meli",
  2767. "melo",
  2768. "mene",
  2769. "tela",
  2770. "tele",
  2771. "teli",
  2772. "telo",
  2773. "tene",
  2774. "cela",
  2775. "cele",
  2776. "celi",
  2777. "celo",
  2778. "cene",
  2779. "vela",
  2780. "vele",
  2781. "veli",
  2782. "velo",
  2783. "vene",
  2784. "gli",
  2785. "ci",
  2786. "la",
  2787. "le",
  2788. "li",
  2789. "lo",
  2790. "mi",
  2791. "ne",
  2792. "si",
  2793. "ti",
  2794. "vi",
  2795. )
  2796. __step1_suffixes = (
  2797. "atrice",
  2798. "atrici",
  2799. "azione",
  2800. "azioni",
  2801. "uzione",
  2802. "uzioni",
  2803. "usione",
  2804. "usioni",
  2805. "amento",
  2806. "amenti",
  2807. "imento",
  2808. "imenti",
  2809. "amente",
  2810. "abile",
  2811. "abili",
  2812. "ibile",
  2813. "ibili",
  2814. "mente",
  2815. "atore",
  2816. "atori",
  2817. "logia",
  2818. "logie",
  2819. "anza",
  2820. "anze",
  2821. "iche",
  2822. "ichi",
  2823. "ismo",
  2824. "ismi",
  2825. "ista",
  2826. "iste",
  2827. "isti",
  2828. "ist\xE0",
  2829. "ist\xE8",
  2830. "ist\xEC",
  2831. "ante",
  2832. "anti",
  2833. "enza",
  2834. "enze",
  2835. "ico",
  2836. "ici",
  2837. "ica",
  2838. "ice",
  2839. "oso",
  2840. "osi",
  2841. "osa",
  2842. "ose",
  2843. "it\xE0",
  2844. "ivo",
  2845. "ivi",
  2846. "iva",
  2847. "ive",
  2848. )
  2849. __step2_suffixes = (
  2850. "erebbero",
  2851. "irebbero",
  2852. "assero",
  2853. "assimo",
  2854. "eranno",
  2855. "erebbe",
  2856. "eremmo",
  2857. "ereste",
  2858. "eresti",
  2859. "essero",
  2860. "iranno",
  2861. "irebbe",
  2862. "iremmo",
  2863. "ireste",
  2864. "iresti",
  2865. "iscano",
  2866. "iscono",
  2867. "issero",
  2868. "arono",
  2869. "avamo",
  2870. "avano",
  2871. "avate",
  2872. "eremo",
  2873. "erete",
  2874. "erono",
  2875. "evamo",
  2876. "evano",
  2877. "evate",
  2878. "iremo",
  2879. "irete",
  2880. "irono",
  2881. "ivamo",
  2882. "ivano",
  2883. "ivate",
  2884. "ammo",
  2885. "ando",
  2886. "asse",
  2887. "assi",
  2888. "emmo",
  2889. "enda",
  2890. "ende",
  2891. "endi",
  2892. "endo",
  2893. "erai",
  2894. "erei",
  2895. "Yamo",
  2896. "iamo",
  2897. "immo",
  2898. "irai",
  2899. "irei",
  2900. "isca",
  2901. "isce",
  2902. "isci",
  2903. "isco",
  2904. "ano",
  2905. "are",
  2906. "ata",
  2907. "ate",
  2908. "ati",
  2909. "ato",
  2910. "ava",
  2911. "avi",
  2912. "avo",
  2913. "er\xE0",
  2914. "ere",
  2915. "er\xF2",
  2916. "ete",
  2917. "eva",
  2918. "evi",
  2919. "evo",
  2920. "ir\xE0",
  2921. "ire",
  2922. "ir\xF2",
  2923. "ita",
  2924. "ite",
  2925. "iti",
  2926. "ito",
  2927. "iva",
  2928. "ivi",
  2929. "ivo",
  2930. "ono",
  2931. "uta",
  2932. "ute",
  2933. "uti",
  2934. "uto",
  2935. "ar",
  2936. "ir",
  2937. )
  2938. def stem(self, word):
  2939. """
  2940. Stem an Italian word and return the stemmed form.
  2941. :param word: The word that is stemmed.
  2942. :type word: str or unicode
  2943. :return: The stemmed form.
  2944. :rtype: unicode
  2945. """
  2946. word = word.lower()
  2947. if word in self.stopwords:
  2948. return word
  2949. step1_success = False
  2950. # All acute accents are replaced by grave accents.
  2951. word = (
  2952. word.replace("\xE1", "\xE0")
  2953. .replace("\xE9", "\xE8")
  2954. .replace("\xED", "\xEC")
  2955. .replace("\xF3", "\xF2")
  2956. .replace("\xFA", "\xF9")
  2957. )
  2958. # Every occurrence of 'u' after 'q'
  2959. # is put into upper case.
  2960. for i in range(1, len(word)):
  2961. if word[i - 1] == "q" and word[i] == "u":
  2962. word = "".join((word[:i], "U", word[i + 1 :]))
  2963. # Every occurrence of 'u' and 'i'
  2964. # between vowels is put into upper case.
  2965. for i in range(1, len(word) - 1):
  2966. if word[i - 1] in self.__vowels and word[i + 1] in self.__vowels:
  2967. if word[i] == "u":
  2968. word = "".join((word[:i], "U", word[i + 1 :]))
  2969. elif word[i] == "i":
  2970. word = "".join((word[:i], "I", word[i + 1 :]))
  2971. r1, r2 = self._r1r2_standard(word, self.__vowels)
  2972. rv = self._rv_standard(word, self.__vowels)
  2973. # STEP 0: Attached pronoun
  2974. for suffix in self.__step0_suffixes:
  2975. if rv.endswith(suffix):
  2976. if rv[-len(suffix) - 4 : -len(suffix)] in ("ando", "endo"):
  2977. word = word[: -len(suffix)]
  2978. r1 = r1[: -len(suffix)]
  2979. r2 = r2[: -len(suffix)]
  2980. rv = rv[: -len(suffix)]
  2981. elif rv[-len(suffix) - 2 : -len(suffix)] in ("ar", "er", "ir"):
  2982. word = suffix_replace(word, suffix, "e")
  2983. r1 = suffix_replace(r1, suffix, "e")
  2984. r2 = suffix_replace(r2, suffix, "e")
  2985. rv = suffix_replace(rv, suffix, "e")
  2986. break
  2987. # STEP 1: Standard suffix removal
  2988. for suffix in self.__step1_suffixes:
  2989. if word.endswith(suffix):
  2990. if suffix == "amente" and r1.endswith(suffix):
  2991. step1_success = True
  2992. word = word[:-6]
  2993. r2 = r2[:-6]
  2994. rv = rv[:-6]
  2995. if r2.endswith("iv"):
  2996. word = word[:-2]
  2997. r2 = r2[:-2]
  2998. rv = rv[:-2]
  2999. if r2.endswith("at"):
  3000. word = word[:-2]
  3001. rv = rv[:-2]
  3002. elif r2.endswith(("os", "ic")):
  3003. word = word[:-2]
  3004. rv = rv[:-2]
  3005. elif r2.endswith("abil"):
  3006. word = word[:-4]
  3007. rv = rv[:-4]
  3008. elif suffix in ("amento", "amenti", "imento", "imenti") and rv.endswith(
  3009. suffix
  3010. ):
  3011. step1_success = True
  3012. word = word[:-6]
  3013. rv = rv[:-6]
  3014. elif r2.endswith(suffix):
  3015. step1_success = True
  3016. if suffix in ("azione", "azioni", "atore", "atori"):
  3017. word = word[: -len(suffix)]
  3018. r2 = r2[: -len(suffix)]
  3019. rv = rv[: -len(suffix)]
  3020. if r2.endswith("ic"):
  3021. word = word[:-2]
  3022. rv = rv[:-2]
elif suffix in ("logia", "logie"):
# Replace 'logia'/'logie' with 'log': drop the final two
# characters from both the word and the RV region.
word = word[:-2]
rv = rv[:-2]
  3026. elif suffix in ("uzione", "uzioni", "usione", "usioni"):
  3027. word = word[:-5]
  3028. rv = rv[:-5]
  3029. elif suffix in ("enza", "enze"):
  3030. word = suffix_replace(word, suffix, "te")
  3031. rv = suffix_replace(rv, suffix, "te")
  3032. elif suffix == "it\xE0":
  3033. word = word[:-3]
  3034. r2 = r2[:-3]
  3035. rv = rv[:-3]
  3036. if r2.endswith(("ic", "iv")):
  3037. word = word[:-2]
  3038. rv = rv[:-2]
  3039. elif r2.endswith("abil"):
  3040. word = word[:-4]
  3041. rv = rv[:-4]
  3042. elif suffix in ("ivo", "ivi", "iva", "ive"):
  3043. word = word[:-3]
  3044. r2 = r2[:-3]
  3045. rv = rv[:-3]
  3046. if r2.endswith("at"):
  3047. word = word[:-2]
  3048. r2 = r2[:-2]
  3049. rv = rv[:-2]
  3050. if r2.endswith("ic"):
  3051. word = word[:-2]
  3052. rv = rv[:-2]
  3053. else:
  3054. word = word[: -len(suffix)]
  3055. rv = rv[: -len(suffix)]
  3056. break
  3057. # STEP 2: Verb suffixes
  3058. if not step1_success:
  3059. for suffix in self.__step2_suffixes:
  3060. if rv.endswith(suffix):
  3061. word = word[: -len(suffix)]
  3062. rv = rv[: -len(suffix)]
  3063. break
  3064. # STEP 3a
  3065. if rv.endswith(("a", "e", "i", "o", "\xE0", "\xE8", "\xEC", "\xF2")):
  3066. word = word[:-1]
  3067. rv = rv[:-1]
  3068. if rv.endswith("i"):
  3069. word = word[:-1]
  3070. rv = rv[:-1]
  3071. # STEP 3b
  3072. if rv.endswith(("ch", "gh")):
  3073. word = word[:-1]
  3074. word = word.replace("I", "i").replace("U", "u")
  3075. return word
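# Illustrative usage sketch for the Italian stemmer defined above (a
# documentation aid, not part of the original module).  It relies only on
# the inherited default constructor; the printed stems are whatever the
# algorithm above produces and are not asserted here.
def _italian_stemmer_example():
    stemmer = ItalianStemmer()
    # The prelude of stem() turns acute accents into grave accents and
    # upper-cases 'u' after 'q' and 'u'/'i' between vowels before the
    # suffix steps run.
    for word in ("abbandonata", "pronunziare", "felicemente"):
        print(word, "->", stemmer.stem(word))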
  3076. class NorwegianStemmer(_ScandinavianStemmer):
  3077. """
  3078. The Norwegian Snowball stemmer.
  3079. :cvar __vowels: The Norwegian vowels.
  3080. :type __vowels: unicode
  3081. :cvar __s_ending: Letters that may directly appear before a word final 's'.
  3082. :type __s_ending: unicode
  3083. :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm.
  3084. :type __step1_suffixes: tuple
  3085. :cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm.
  3086. :type __step2_suffixes: tuple
  3087. :cvar __step3_suffixes: Suffixes to be deleted in step 3 of the algorithm.
  3088. :type __step3_suffixes: tuple
  3089. :note: A detailed description of the Norwegian
  3090. stemming algorithm can be found under
  3091. http://snowball.tartarus.org/algorithms/norwegian/stemmer.html
  3092. """
  3093. __vowels = "aeiouy\xE6\xE5\xF8"
  3094. __s_ending = "bcdfghjlmnoprtvyz"
  3095. __step1_suffixes = (
  3096. "hetenes",
  3097. "hetene",
  3098. "hetens",
  3099. "heter",
  3100. "heten",
  3101. "endes",
  3102. "ande",
  3103. "ende",
  3104. "edes",
  3105. "enes",
  3106. "erte",
  3107. "ede",
  3108. "ane",
  3109. "ene",
  3110. "ens",
  3111. "ers",
  3112. "ets",
  3113. "het",
  3114. "ast",
  3115. "ert",
  3116. "en",
  3117. "ar",
  3118. "er",
  3119. "as",
  3120. "es",
  3121. "et",
  3122. "a",
  3123. "e",
  3124. "s",
  3125. )
  3126. __step2_suffixes = ("dt", "vt")
  3127. __step3_suffixes = (
  3128. "hetslov",
  3129. "eleg",
  3130. "elig",
  3131. "elov",
  3132. "slov",
  3133. "leg",
  3134. "eig",
  3135. "lig",
  3136. "els",
  3137. "lov",
  3138. "ig",
  3139. )
  3140. def stem(self, word):
  3141. """
  3142. Stem a Norwegian word and return the stemmed form.
  3143. :param word: The word that is stemmed.
  3144. :type word: str or unicode
  3145. :return: The stemmed form.
  3146. :rtype: unicode
  3147. """
  3148. word = word.lower()
  3149. if word in self.stopwords:
  3150. return word
  3151. r1 = self._r1_scandinavian(word, self.__vowels)
  3152. # STEP 1
  3153. for suffix in self.__step1_suffixes:
  3154. if r1.endswith(suffix):
  3155. if suffix in ("erte", "ert"):
  3156. word = suffix_replace(word, suffix, "er")
  3157. r1 = suffix_replace(r1, suffix, "er")
  3158. elif suffix == "s":
  3159. if word[-2] in self.__s_ending or (
  3160. word[-2] == "k" and word[-3] not in self.__vowels
  3161. ):
  3162. word = word[:-1]
  3163. r1 = r1[:-1]
  3164. else:
  3165. word = word[: -len(suffix)]
  3166. r1 = r1[: -len(suffix)]
  3167. break
  3168. # STEP 2
  3169. for suffix in self.__step2_suffixes:
  3170. if r1.endswith(suffix):
  3171. word = word[:-1]
  3172. r1 = r1[:-1]
  3173. break
  3174. # STEP 3
  3175. for suffix in self.__step3_suffixes:
  3176. if r1.endswith(suffix):
  3177. word = word[: -len(suffix)]
  3178. break
  3179. return word
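# Illustrative usage sketch for the Norwegian stemmer above (a documentation
# aid, not part of the original module; no particular stems are asserted).
def _norwegian_stemmer_example():
    stemmer = NorwegianStemmer()
    # Step 1 only removes a final 's' when the preceding letter is listed
    # in __s_ending, or is a 'k' that is not preceded by a vowel.
    for word in ("huset", "bilene", "billigst"):
        print(word, "->", stemmer.stem(word))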
  3180. class PortugueseStemmer(_StandardStemmer):
  3181. """
  3182. The Portuguese Snowball stemmer.
  3183. :cvar __vowels: The Portuguese vowels.
  3184. :type __vowels: unicode
  3185. :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm.
  3186. :type __step1_suffixes: tuple
  3187. :cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm.
  3188. :type __step2_suffixes: tuple
  3189. :cvar __step4_suffixes: Suffixes to be deleted in step 4 of the algorithm.
  3190. :type __step4_suffixes: tuple
  3191. :note: A detailed description of the Portuguese
  3192. stemming algorithm can be found under
  3193. http://snowball.tartarus.org/algorithms/portuguese/stemmer.html
  3194. """
  3195. __vowels = "aeiou\xE1\xE9\xED\xF3\xFA\xE2\xEA\xF4"
  3196. __step1_suffixes = (
  3197. "amentos",
  3198. "imentos",
  3199. "uço~es",
  3200. "amento",
  3201. "imento",
  3202. "adoras",
  3203. "adores",
  3204. "a\xE7o~es",
  3205. "logias",
  3206. "\xEAncias",
  3207. "amente",
  3208. "idades",
  3209. "an\xE7as",
  3210. "ismos",
  3211. "istas",
  3212. "adora",
  3213. "a\xE7a~o",
  3214. "antes",
  3215. "\xE2ncia",
  3216. "logia",
  3217. "uça~o",
  3218. "\xEAncia",
  3219. "mente",
  3220. "idade",
  3221. "an\xE7a",
  3222. "ezas",
  3223. "icos",
  3224. "icas",
  3225. "ismo",
  3226. "\xE1vel",
  3227. "\xEDvel",
  3228. "ista",
  3229. "osos",
  3230. "osas",
  3231. "ador",
  3232. "ante",
  3233. "ivas",
  3234. "ivos",
  3235. "iras",
  3236. "eza",
  3237. "ico",
  3238. "ica",
  3239. "oso",
  3240. "osa",
  3241. "iva",
  3242. "ivo",
  3243. "ira",
  3244. )
  3245. __step2_suffixes = (
  3246. "ar\xEDamos",
  3247. "er\xEDamos",
  3248. "ir\xEDamos",
  3249. "\xE1ssemos",
  3250. "\xEAssemos",
  3251. "\xEDssemos",
  3252. "ar\xEDeis",
  3253. "er\xEDeis",
  3254. "ir\xEDeis",
  3255. "\xE1sseis",
  3256. "\xE9sseis",
  3257. "\xEDsseis",
  3258. "\xE1ramos",
  3259. "\xE9ramos",
  3260. "\xEDramos",
  3261. "\xE1vamos",
  3262. "aremos",
  3263. "eremos",
  3264. "iremos",
  3265. "ariam",
  3266. "eriam",
  3267. "iriam",
  3268. "assem",
  3269. "essem",
  3270. "issem",
  3271. "ara~o",
  3272. "era~o",
  3273. "ira~o",
  3274. "arias",
  3275. "erias",
  3276. "irias",
  3277. "ardes",
  3278. "erdes",
  3279. "irdes",
  3280. "asses",
  3281. "esses",
  3282. "isses",
  3283. "astes",
  3284. "estes",
  3285. "istes",
  3286. "\xE1reis",
  3287. "areis",
  3288. "\xE9reis",
  3289. "ereis",
  3290. "\xEDreis",
  3291. "ireis",
  3292. "\xE1veis",
  3293. "\xEDamos",
  3294. "armos",
  3295. "ermos",
  3296. "irmos",
  3297. "aria",
  3298. "eria",
  3299. "iria",
  3300. "asse",
  3301. "esse",
  3302. "isse",
  3303. "aste",
  3304. "este",
  3305. "iste",
  3306. "arei",
  3307. "erei",
  3308. "irei",
  3309. "aram",
  3310. "eram",
  3311. "iram",
  3312. "avam",
  3313. "arem",
  3314. "erem",
  3315. "irem",
  3316. "ando",
  3317. "endo",
  3318. "indo",
  3319. "adas",
  3320. "idas",
  3321. "ar\xE1s",
  3322. "aras",
  3323. "er\xE1s",
  3324. "eras",
  3325. "ir\xE1s",
  3326. "avas",
  3327. "ares",
  3328. "eres",
  3329. "ires",
  3330. "\xEDeis",
  3331. "ados",
  3332. "idos",
  3333. "\xE1mos",
  3334. "amos",
  3335. "emos",
  3336. "imos",
  3337. "iras",
  3338. "ada",
  3339. "ida",
  3340. "ar\xE1",
  3341. "ara",
  3342. "er\xE1",
  3343. "era",
  3344. "ir\xE1",
  3345. "ava",
  3346. "iam",
  3347. "ado",
  3348. "ido",
  3349. "ias",
  3350. "ais",
  3351. "eis",
  3352. "ira",
  3353. "ia",
  3354. "ei",
  3355. "am",
  3356. "em",
  3357. "ar",
  3358. "er",
  3359. "ir",
  3360. "as",
  3361. "es",
  3362. "is",
  3363. "eu",
  3364. "iu",
  3365. "ou",
  3366. )
  3367. __step4_suffixes = ("os", "a", "i", "o", "\xE1", "\xED", "\xF3")
  3368. def stem(self, word):
  3369. """
  3370. Stem a Portuguese word and return the stemmed form.
  3371. :param word: The word that is stemmed.
  3372. :type word: str or unicode
  3373. :return: The stemmed form.
  3374. :rtype: unicode
  3375. """
  3376. word = word.lower()
  3377. if word in self.stopwords:
  3378. return word
  3379. step1_success = False
  3380. step2_success = False
  3381. word = (
  3382. word.replace("\xE3", "a~")
  3383. .replace("\xF5", "o~")
  3384. .replace("q\xFC", "qu")
  3385. .replace("g\xFC", "gu")
  3386. )
  3387. r1, r2 = self._r1r2_standard(word, self.__vowels)
  3388. rv = self._rv_standard(word, self.__vowels)
  3389. # STEP 1: Standard suffix removal
  3390. for suffix in self.__step1_suffixes:
  3391. if word.endswith(suffix):
  3392. if suffix == "amente" and r1.endswith(suffix):
  3393. step1_success = True
  3394. word = word[:-6]
  3395. r2 = r2[:-6]
  3396. rv = rv[:-6]
  3397. if r2.endswith("iv"):
  3398. word = word[:-2]
  3399. r2 = r2[:-2]
  3400. rv = rv[:-2]
  3401. if r2.endswith("at"):
  3402. word = word[:-2]
  3403. rv = rv[:-2]
  3404. elif r2.endswith(("os", "ic", "ad")):
  3405. word = word[:-2]
  3406. rv = rv[:-2]
  3407. elif (
  3408. suffix in ("ira", "iras")
  3409. and rv.endswith(suffix)
  3410. and word[-len(suffix) - 1 : -len(suffix)] == "e"
  3411. ):
  3412. step1_success = True
  3413. word = suffix_replace(word, suffix, "ir")
  3414. rv = suffix_replace(rv, suffix, "ir")
  3415. elif r2.endswith(suffix):
  3416. step1_success = True
  3417. if suffix in ("logia", "logias"):
  3418. word = suffix_replace(word, suffix, "log")
  3419. rv = suffix_replace(rv, suffix, "log")
  3420. elif suffix in ("uça~o", "uço~es"):
  3421. word = suffix_replace(word, suffix, "u")
  3422. rv = suffix_replace(rv, suffix, "u")
  3423. elif suffix in ("\xEAncia", "\xEAncias"):
  3424. word = suffix_replace(word, suffix, "ente")
  3425. rv = suffix_replace(rv, suffix, "ente")
  3426. elif suffix == "mente":
  3427. word = word[:-5]
  3428. r2 = r2[:-5]
  3429. rv = rv[:-5]
  3430. if r2.endswith(("ante", "avel", "ivel")):
  3431. word = word[:-4]
  3432. rv = rv[:-4]
  3433. elif suffix in ("idade", "idades"):
  3434. word = word[: -len(suffix)]
  3435. r2 = r2[: -len(suffix)]
  3436. rv = rv[: -len(suffix)]
  3437. if r2.endswith(("ic", "iv")):
  3438. word = word[:-2]
  3439. rv = rv[:-2]
  3440. elif r2.endswith("abil"):
  3441. word = word[:-4]
  3442. rv = rv[:-4]
  3443. elif suffix in ("iva", "ivo", "ivas", "ivos"):
  3444. word = word[: -len(suffix)]
  3445. r2 = r2[: -len(suffix)]
  3446. rv = rv[: -len(suffix)]
  3447. if r2.endswith("at"):
  3448. word = word[:-2]
  3449. rv = rv[:-2]
  3450. else:
  3451. word = word[: -len(suffix)]
  3452. rv = rv[: -len(suffix)]
  3453. break
  3454. # STEP 2: Verb suffixes
  3455. if not step1_success:
  3456. for suffix in self.__step2_suffixes:
  3457. if rv.endswith(suffix):
  3458. step2_success = True
  3459. word = word[: -len(suffix)]
  3460. rv = rv[: -len(suffix)]
  3461. break
  3462. # STEP 3
  3463. if step1_success or step2_success:
  3464. if rv.endswith("i") and word[-2] == "c":
  3465. word = word[:-1]
  3466. rv = rv[:-1]
# STEP 4: Residual suffix
  3468. if not step1_success and not step2_success:
  3469. for suffix in self.__step4_suffixes:
  3470. if rv.endswith(suffix):
  3471. word = word[: -len(suffix)]
  3472. rv = rv[: -len(suffix)]
  3473. break
  3474. # STEP 5
  3475. if rv.endswith(("e", "\xE9", "\xEA")):
  3476. word = word[:-1]
  3477. rv = rv[:-1]
  3478. if (word.endswith("gu") and rv.endswith("u")) or (
  3479. word.endswith("ci") and rv.endswith("i")
  3480. ):
  3481. word = word[:-1]
  3482. elif word.endswith("\xE7"):
  3483. word = suffix_replace(word, "\xE7", "c")
  3484. word = word.replace("a~", "\xE3").replace("o~", "\xF5")
  3485. return word
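# Illustrative usage sketch for the Portuguese stemmer above (a documentation
# aid, not part of the original module; no particular stems are asserted).
def _portuguese_stemmer_example():
    stemmer = PortugueseStemmer()
    # stem() temporarily rewrites 'ã'/'õ' as 'a~'/'o~' so the suffix tables
    # can match them, and restores the original letters before returning.
    for word in ("meninas", "bonita", "corações"):
        print(word, "->", stemmer.stem(word))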
  3486. class RomanianStemmer(_StandardStemmer):
  3487. """
  3488. The Romanian Snowball stemmer.
  3489. :cvar __vowels: The Romanian vowels.
  3490. :type __vowels: unicode
  3491. :cvar __step0_suffixes: Suffixes to be deleted in step 0 of the algorithm.
  3492. :type __step0_suffixes: tuple
  3493. :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm.
  3494. :type __step1_suffixes: tuple
  3495. :cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm.
  3496. :type __step2_suffixes: tuple
  3497. :cvar __step3_suffixes: Suffixes to be deleted in step 3 of the algorithm.
  3498. :type __step3_suffixes: tuple
  3499. :note: A detailed description of the Romanian
  3500. stemming algorithm can be found under
  3501. http://snowball.tartarus.org/algorithms/romanian/stemmer.html
  3502. """
  3503. __vowels = "aeiou\u0103\xE2\xEE"
  3504. __step0_suffixes = (
  3505. "iilor",
  3506. "ului",
  3507. "elor",
  3508. "iile",
  3509. "ilor",
  3510. "atei",
  3511. "a\u0163ie",
  3512. "a\u0163ia",
  3513. "aua",
  3514. "ele",
  3515. "iua",
  3516. "iei",
  3517. "ile",
  3518. "ul",
  3519. "ea",
  3520. "ii",
  3521. )
  3522. __step1_suffixes = (
  3523. "abilitate",
  3524. "abilitati",
  3525. "abilit\u0103\u0163i",
  3526. "ibilitate",
  3527. "abilit\u0103i",
  3528. "ivitate",
  3529. "ivitati",
  3530. "ivit\u0103\u0163i",
  3531. "icitate",
  3532. "icitati",
  3533. "icit\u0103\u0163i",
  3534. "icatori",
  3535. "ivit\u0103i",
  3536. "icit\u0103i",
  3537. "icator",
  3538. "a\u0163iune",
  3539. "atoare",
  3540. "\u0103toare",
  3541. "i\u0163iune",
  3542. "itoare",
  3543. "iciva",
  3544. "icive",
  3545. "icivi",
  3546. "iciv\u0103",
  3547. "icala",
  3548. "icale",
  3549. "icali",
  3550. "ical\u0103",
  3551. "ativa",
  3552. "ative",
  3553. "ativi",
  3554. "ativ\u0103",
  3555. "atori",
  3556. "\u0103tori",
  3557. "itiva",
  3558. "itive",
  3559. "itivi",
  3560. "itiv\u0103",
  3561. "itori",
  3562. "iciv",
  3563. "ical",
  3564. "ativ",
  3565. "ator",
  3566. "\u0103tor",
  3567. "itiv",
  3568. "itor",
  3569. )
  3570. __step2_suffixes = (
  3571. "abila",
  3572. "abile",
  3573. "abili",
  3574. "abil\u0103",
  3575. "ibila",
  3576. "ibile",
  3577. "ibili",
  3578. "ibil\u0103",
  3579. "atori",
  3580. "itate",
  3581. "itati",
  3582. "it\u0103\u0163i",
  3583. "abil",
  3584. "ibil",
  3585. "oasa",
  3586. "oas\u0103",
  3587. "oase",
  3588. "anta",
  3589. "ante",
  3590. "anti",
  3591. "ant\u0103",
  3592. "ator",
  3593. "it\u0103i",
  3594. "iune",
  3595. "iuni",
  3596. "isme",
  3597. "ista",
  3598. "iste",
  3599. "isti",
  3600. "ist\u0103",
  3601. "i\u015Fti",
  3602. "ata",
  3603. "at\u0103",
  3604. "ati",
  3605. "ate",
  3606. "uta",
  3607. "ut\u0103",
  3608. "uti",
  3609. "ute",
  3610. "ita",
  3611. "it\u0103",
  3612. "iti",
  3613. "ite",
  3614. "ica",
  3615. "ice",
  3616. "ici",
  3617. "ic\u0103",
  3618. "osi",
  3619. "o\u015Fi",
  3620. "ant",
  3621. "iva",
  3622. "ive",
  3623. "ivi",
  3624. "iv\u0103",
  3625. "ism",
  3626. "ist",
  3627. "at",
  3628. "ut",
  3629. "it",
  3630. "ic",
  3631. "os",
  3632. "iv",
  3633. )
  3634. __step3_suffixes = (
  3635. "seser\u0103\u0163i",
  3636. "aser\u0103\u0163i",
  3637. "iser\u0103\u0163i",
  3638. "\xE2ser\u0103\u0163i",
  3639. "user\u0103\u0163i",
  3640. "seser\u0103m",
  3641. "aser\u0103m",
  3642. "iser\u0103m",
  3643. "\xE2ser\u0103m",
  3644. "user\u0103m",
  3645. "ser\u0103\u0163i",
  3646. "sese\u015Fi",
  3647. "seser\u0103",
  3648. "easc\u0103",
  3649. "ar\u0103\u0163i",
  3650. "ur\u0103\u0163i",
  3651. "ir\u0103\u0163i",
  3652. "\xE2r\u0103\u0163i",
  3653. "ase\u015Fi",
  3654. "aser\u0103",
  3655. "ise\u015Fi",
  3656. "iser\u0103",
  3657. "\xe2se\u015Fi",
  3658. "\xE2ser\u0103",
  3659. "use\u015Fi",
  3660. "user\u0103",
  3661. "ser\u0103m",
  3662. "sesem",
  3663. "indu",
  3664. "\xE2ndu",
  3665. "eaz\u0103",
  3666. "e\u015Fti",
  3667. "e\u015Fte",
  3668. "\u0103\u015Fti",
  3669. "\u0103\u015Fte",
  3670. "ea\u0163i",
  3671. "ia\u0163i",
  3672. "ar\u0103m",
  3673. "ur\u0103m",
  3674. "ir\u0103m",
  3675. "\xE2r\u0103m",
  3676. "asem",
  3677. "isem",
  3678. "\xE2sem",
  3679. "usem",
  3680. "se\u015Fi",
  3681. "ser\u0103",
  3682. "sese",
  3683. "are",
  3684. "ere",
  3685. "ire",
  3686. "\xE2re",
  3687. "ind",
  3688. "\xE2nd",
  3689. "eze",
  3690. "ezi",
  3691. "esc",
  3692. "\u0103sc",
  3693. "eam",
  3694. "eai",
  3695. "eau",
  3696. "iam",
  3697. "iai",
  3698. "iau",
  3699. "a\u015Fi",
  3700. "ar\u0103",
  3701. "u\u015Fi",
  3702. "ur\u0103",
  3703. "i\u015Fi",
  3704. "ir\u0103",
  3705. "\xE2\u015Fi",
  3706. "\xe2r\u0103",
  3707. "ase",
  3708. "ise",
  3709. "\xE2se",
  3710. "use",
  3711. "a\u0163i",
  3712. "e\u0163i",
  3713. "i\u0163i",
  3714. "\xe2\u0163i",
  3715. "sei",
  3716. "ez",
  3717. "am",
  3718. "ai",
  3719. "au",
  3720. "ea",
  3721. "ia",
  3722. "ui",
  3723. "\xE2i",
  3724. "\u0103m",
  3725. "em",
  3726. "im",
  3727. "\xE2m",
  3728. "se",
  3729. )
  3730. def stem(self, word):
  3731. """
  3732. Stem a Romanian word and return the stemmed form.
  3733. :param word: The word that is stemmed.
  3734. :type word: str or unicode
  3735. :return: The stemmed form.
  3736. :rtype: unicode
  3737. """
  3738. word = word.lower()
  3739. if word in self.stopwords:
  3740. return word
  3741. step1_success = False
  3742. step2_success = False
  3743. for i in range(1, len(word) - 1):
  3744. if word[i - 1] in self.__vowels and word[i + 1] in self.__vowels:
  3745. if word[i] == "u":
  3746. word = "".join((word[:i], "U", word[i + 1 :]))
  3747. elif word[i] == "i":
  3748. word = "".join((word[:i], "I", word[i + 1 :]))
  3749. r1, r2 = self._r1r2_standard(word, self.__vowels)
  3750. rv = self._rv_standard(word, self.__vowels)
  3751. # STEP 0: Removal of plurals and other simplifications
  3752. for suffix in self.__step0_suffixes:
  3753. if word.endswith(suffix):
  3754. if suffix in r1:
  3755. if suffix in ("ul", "ului"):
  3756. word = word[: -len(suffix)]
  3757. if suffix in rv:
  3758. rv = rv[: -len(suffix)]
  3759. else:
  3760. rv = ""
  3761. elif (
  3762. suffix == "aua"
  3763. or suffix == "atei"
  3764. or (suffix == "ile" and word[-5:-3] != "ab")
  3765. ):
  3766. word = word[:-2]
  3767. elif suffix in ("ea", "ele", "elor"):
  3768. word = suffix_replace(word, suffix, "e")
  3769. if suffix in rv:
  3770. rv = suffix_replace(rv, suffix, "e")
  3771. else:
  3772. rv = ""
  3773. elif suffix in ("ii", "iua", "iei", "iile", "iilor", "ilor"):
  3774. word = suffix_replace(word, suffix, "i")
  3775. if suffix in rv:
  3776. rv = suffix_replace(rv, suffix, "i")
  3777. else:
  3778. rv = ""
  3779. elif suffix in ("a\u0163ie", "a\u0163ia"):
  3780. word = word[:-1]
  3781. break
  3782. # STEP 1: Reduction of combining suffixes
  3783. while True:
  3784. replacement_done = False
  3785. for suffix in self.__step1_suffixes:
  3786. if word.endswith(suffix):
  3787. if suffix in r1:
  3788. step1_success = True
  3789. replacement_done = True
  3790. if suffix in (
  3791. "abilitate",
  3792. "abilitati",
  3793. "abilit\u0103i",
  3794. "abilit\u0103\u0163i",
  3795. ):
  3796. word = suffix_replace(word, suffix, "abil")
  3797. elif suffix == "ibilitate":
  3798. word = word[:-5]
  3799. elif suffix in (
  3800. "ivitate",
  3801. "ivitati",
  3802. "ivit\u0103i",
  3803. "ivit\u0103\u0163i",
  3804. ):
  3805. word = suffix_replace(word, suffix, "iv")
  3806. elif suffix in (
  3807. "icitate",
  3808. "icitati",
  3809. "icit\u0103i",
  3810. "icit\u0103\u0163i",
  3811. "icator",
  3812. "icatori",
  3813. "iciv",
  3814. "iciva",
  3815. "icive",
  3816. "icivi",
  3817. "iciv\u0103",
  3818. "ical",
  3819. "icala",
  3820. "icale",
  3821. "icali",
  3822. "ical\u0103",
  3823. ):
  3824. word = suffix_replace(word, suffix, "ic")
  3825. elif suffix in (
  3826. "ativ",
  3827. "ativa",
  3828. "ative",
  3829. "ativi",
  3830. "ativ\u0103",
  3831. "a\u0163iune",
  3832. "atoare",
  3833. "ator",
  3834. "atori",
  3835. "\u0103toare",
  3836. "\u0103tor",
  3837. "\u0103tori",
  3838. ):
  3839. word = suffix_replace(word, suffix, "at")
  3840. if suffix in r2:
  3841. r2 = suffix_replace(r2, suffix, "at")
  3842. elif suffix in (
  3843. "itiv",
  3844. "itiva",
  3845. "itive",
  3846. "itivi",
  3847. "itiv\u0103",
  3848. "i\u0163iune",
  3849. "itoare",
  3850. "itor",
  3851. "itori",
  3852. ):
  3853. word = suffix_replace(word, suffix, "it")
  3854. if suffix in r2:
  3855. r2 = suffix_replace(r2, suffix, "it")
  3856. else:
  3857. step1_success = False
  3858. break
  3859. if not replacement_done:
  3860. break
  3861. # STEP 2: Removal of standard suffixes
  3862. for suffix in self.__step2_suffixes:
  3863. if word.endswith(suffix):
  3864. if suffix in r2:
  3865. step2_success = True
  3866. if suffix in ("iune", "iuni"):
  3867. if word[-5] == "\u0163":
  3868. word = "".join((word[:-5], "t"))
  3869. elif suffix in (
  3870. "ism",
  3871. "isme",
  3872. "ist",
  3873. "ista",
  3874. "iste",
  3875. "isti",
  3876. "ist\u0103",
  3877. "i\u015Fti",
  3878. ):
  3879. word = suffix_replace(word, suffix, "ist")
  3880. else:
  3881. word = word[: -len(suffix)]
  3882. break
  3883. # STEP 3: Removal of verb suffixes
  3884. if not step1_success and not step2_success:
  3885. for suffix in self.__step3_suffixes:
  3886. if word.endswith(suffix):
  3887. if suffix in rv:
  3888. if suffix in (
  3889. "seser\u0103\u0163i",
  3890. "seser\u0103m",
  3891. "ser\u0103\u0163i",
  3892. "sese\u015Fi",
  3893. "seser\u0103",
  3894. "ser\u0103m",
  3895. "sesem",
  3896. "se\u015Fi",
  3897. "ser\u0103",
  3898. "sese",
  3899. "a\u0163i",
  3900. "e\u0163i",
  3901. "i\u0163i",
  3902. "\xE2\u0163i",
  3903. "sei",
  3904. "\u0103m",
  3905. "em",
  3906. "im",
  3907. "\xE2m",
  3908. "se",
  3909. ):
  3910. word = word[: -len(suffix)]
  3911. rv = rv[: -len(suffix)]
  3912. else:
  3913. if (
  3914. not rv.startswith(suffix)
  3915. and rv[rv.index(suffix) - 1] not in "aeio\u0103\xE2\xEE"
  3916. ):
  3917. word = word[: -len(suffix)]
  3918. break
  3919. # STEP 4: Removal of final vowel
  3920. for suffix in ("ie", "a", "e", "i", "\u0103"):
  3921. if word.endswith(suffix):
  3922. if suffix in rv:
  3923. word = word[: -len(suffix)]
  3924. break
  3925. word = word.replace("I", "i").replace("U", "u")
  3926. return word
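# Illustrative usage sketch for the Romanian stemmer above (a documentation
# aid, not part of the original module; no particular stems are asserted).
def _romanian_stemmer_example():
    stemmer = RomanianStemmer()
    # As in the Italian stemmer, 'u' and 'i' between vowels are temporarily
    # upper-cased before the suffix steps and lower-cased again at the end.
    for word in ("copiilor", "frumoasă", "lucrarea"):
        print(word, "->", stemmer.stem(word))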
  3927. class RussianStemmer(_LanguageSpecificStemmer):
  3928. """
  3929. The Russian Snowball stemmer.
  3930. :cvar __perfective_gerund_suffixes: Suffixes to be deleted.
  3931. :type __perfective_gerund_suffixes: tuple
  3932. :cvar __adjectival_suffixes: Suffixes to be deleted.
  3933. :type __adjectival_suffixes: tuple
  3934. :cvar __reflexive_suffixes: Suffixes to be deleted.
  3935. :type __reflexive_suffixes: tuple
  3936. :cvar __verb_suffixes: Suffixes to be deleted.
  3937. :type __verb_suffixes: tuple
  3938. :cvar __noun_suffixes: Suffixes to be deleted.
  3939. :type __noun_suffixes: tuple
  3940. :cvar __superlative_suffixes: Suffixes to be deleted.
  3941. :type __superlative_suffixes: tuple
  3942. :cvar __derivational_suffixes: Suffixes to be deleted.
  3943. :type __derivational_suffixes: tuple
  3944. :note: A detailed description of the Russian
  3945. stemming algorithm can be found under
  3946. http://snowball.tartarus.org/algorithms/russian/stemmer.html
  3947. """
  3948. __perfective_gerund_suffixes = (
  3949. "ivshis'",
  3950. "yvshis'",
  3951. "vshis'",
  3952. "ivshi",
  3953. "yvshi",
  3954. "vshi",
  3955. "iv",
  3956. "yv",
  3957. "v",
  3958. )
  3959. __adjectival_suffixes = (
  3960. "ui^ushchi^ui^u",
  3961. "ui^ushchi^ai^a",
  3962. "ui^ushchimi",
  3963. "ui^ushchymi",
  3964. "ui^ushchego",
  3965. "ui^ushchogo",
  3966. "ui^ushchemu",
  3967. "ui^ushchomu",
  3968. "ui^ushchikh",
  3969. "ui^ushchykh",
  3970. "ui^ushchui^u",
  3971. "ui^ushchaia",
  3972. "ui^ushchoi^u",
  3973. "ui^ushchei^u",
  3974. "i^ushchi^ui^u",
  3975. "i^ushchi^ai^a",
  3976. "ui^ushchee",
  3977. "ui^ushchie",
  3978. "ui^ushchye",
  3979. "ui^ushchoe",
  3980. "ui^ushchei`",
  3981. "ui^ushchii`",
  3982. "ui^ushchyi`",
  3983. "ui^ushchoi`",
  3984. "ui^ushchem",
  3985. "ui^ushchim",
  3986. "ui^ushchym",
  3987. "ui^ushchom",
  3988. "i^ushchimi",
  3989. "i^ushchymi",
  3990. "i^ushchego",
  3991. "i^ushchogo",
  3992. "i^ushchemu",
  3993. "i^ushchomu",
  3994. "i^ushchikh",
  3995. "i^ushchykh",
  3996. "i^ushchui^u",
  3997. "i^ushchai^a",
  3998. "i^ushchoi^u",
  3999. "i^ushchei^u",
  4000. "i^ushchee",
  4001. "i^ushchie",
  4002. "i^ushchye",
  4003. "i^ushchoe",
  4004. "i^ushchei`",
  4005. "i^ushchii`",
  4006. "i^ushchyi`",
  4007. "i^ushchoi`",
  4008. "i^ushchem",
  4009. "i^ushchim",
  4010. "i^ushchym",
  4011. "i^ushchom",
  4012. "shchi^ui^u",
  4013. "shchi^ai^a",
  4014. "ivshi^ui^u",
  4015. "ivshi^ai^a",
  4016. "yvshi^ui^u",
  4017. "yvshi^ai^a",
  4018. "shchimi",
  4019. "shchymi",
  4020. "shchego",
  4021. "shchogo",
  4022. "shchemu",
  4023. "shchomu",
  4024. "shchikh",
  4025. "shchykh",
  4026. "shchui^u",
  4027. "shchai^a",
  4028. "shchoi^u",
  4029. "shchei^u",
  4030. "ivshimi",
  4031. "ivshymi",
  4032. "ivshego",
  4033. "ivshogo",
  4034. "ivshemu",
  4035. "ivshomu",
  4036. "ivshikh",
  4037. "ivshykh",
  4038. "ivshui^u",
  4039. "ivshai^a",
  4040. "ivshoi^u",
  4041. "ivshei^u",
  4042. "yvshimi",
  4043. "yvshymi",
  4044. "yvshego",
  4045. "yvshogo",
  4046. "yvshemu",
  4047. "yvshomu",
  4048. "yvshikh",
  4049. "yvshykh",
  4050. "yvshui^u",
  4051. "yvshai^a",
  4052. "yvshoi^u",
  4053. "yvshei^u",
  4054. "vshi^ui^u",
  4055. "vshi^ai^a",
  4056. "shchee",
  4057. "shchie",
  4058. "shchye",
  4059. "shchoe",
  4060. "shchei`",
  4061. "shchii`",
  4062. "shchyi`",
  4063. "shchoi`",
  4064. "shchem",
  4065. "shchim",
  4066. "shchym",
  4067. "shchom",
  4068. "ivshee",
  4069. "ivshie",
  4070. "ivshye",
  4071. "ivshoe",
  4072. "ivshei`",
  4073. "ivshii`",
  4074. "ivshyi`",
  4075. "ivshoi`",
  4076. "ivshem",
  4077. "ivshim",
  4078. "ivshym",
  4079. "ivshom",
  4080. "yvshee",
  4081. "yvshie",
  4082. "yvshye",
  4083. "yvshoe",
  4084. "yvshei`",
  4085. "yvshii`",
  4086. "yvshyi`",
  4087. "yvshoi`",
  4088. "yvshem",
  4089. "yvshim",
  4090. "yvshym",
  4091. "yvshom",
  4092. "vshimi",
  4093. "vshymi",
  4094. "vshego",
  4095. "vshogo",
  4096. "vshemu",
  4097. "vshomu",
  4098. "vshikh",
  4099. "vshykh",
  4100. "vshui^u",
  4101. "vshai^a",
  4102. "vshoi^u",
  4103. "vshei^u",
  4104. "emi^ui^u",
  4105. "emi^ai^a",
  4106. "nni^ui^u",
  4107. "nni^ai^a",
  4108. "vshee",
  4109. "vshie",
  4110. "vshye",
  4111. "vshoe",
  4112. "vshei`",
  4113. "vshii`",
  4114. "vshyi`",
  4115. "vshoi`",
  4116. "vshem",
  4117. "vshim",
  4118. "vshym",
  4119. "vshom",
  4120. "emimi",
  4121. "emymi",
  4122. "emego",
  4123. "emogo",
  4124. "ememu",
  4125. "emomu",
  4126. "emikh",
  4127. "emykh",
  4128. "emui^u",
  4129. "emai^a",
  4130. "emoi^u",
  4131. "emei^u",
  4132. "nnimi",
  4133. "nnymi",
  4134. "nnego",
  4135. "nnogo",
  4136. "nnemu",
  4137. "nnomu",
  4138. "nnikh",
  4139. "nnykh",
  4140. "nnui^u",
  4141. "nnai^a",
  4142. "nnoi^u",
  4143. "nnei^u",
  4144. "emee",
  4145. "emie",
  4146. "emye",
  4147. "emoe",
  4148. "emei`",
  4149. "emii`",
  4150. "emyi`",
  4151. "emoi`",
  4152. "emem",
  4153. "emim",
  4154. "emym",
  4155. "emom",
  4156. "nnee",
  4157. "nnie",
  4158. "nnye",
  4159. "nnoe",
  4160. "nnei`",
  4161. "nnii`",
  4162. "nnyi`",
  4163. "nnoi`",
  4164. "nnem",
  4165. "nnim",
  4166. "nnym",
  4167. "nnom",
  4168. "i^ui^u",
  4169. "i^ai^a",
  4170. "imi",
  4171. "ymi",
  4172. "ego",
  4173. "ogo",
  4174. "emu",
  4175. "omu",
  4176. "ikh",
  4177. "ykh",
  4178. "ui^u",
  4179. "ai^a",
  4180. "oi^u",
  4181. "ei^u",
  4182. "ee",
  4183. "ie",
  4184. "ye",
  4185. "oe",
  4186. "ei`",
  4187. "ii`",
  4188. "yi`",
  4189. "oi`",
  4190. "em",
  4191. "im",
  4192. "ym",
  4193. "om",
  4194. )
  4195. __reflexive_suffixes = ("si^a", "s'")
  4196. __verb_suffixes = (
  4197. "esh'",
  4198. "ei`te",
  4199. "ui`te",
  4200. "ui^ut",
  4201. "ish'",
  4202. "ete",
  4203. "i`te",
  4204. "i^ut",
  4205. "nno",
  4206. "ila",
  4207. "yla",
  4208. "ena",
  4209. "ite",
  4210. "ili",
  4211. "yli",
  4212. "ilo",
  4213. "ylo",
  4214. "eno",
  4215. "i^at",
  4216. "uet",
  4217. "eny",
  4218. "it'",
  4219. "yt'",
  4220. "ui^u",
  4221. "la",
  4222. "na",
  4223. "li",
  4224. "em",
  4225. "lo",
  4226. "no",
  4227. "et",
  4228. "ny",
  4229. "t'",
  4230. "ei`",
  4231. "ui`",
  4232. "il",
  4233. "yl",
  4234. "im",
  4235. "ym",
  4236. "en",
  4237. "it",
  4238. "yt",
  4239. "i^u",
  4240. "i`",
  4241. "l",
  4242. "n",
  4243. )
  4244. __noun_suffixes = (
  4245. "ii^ami",
  4246. "ii^akh",
  4247. "i^ami",
  4248. "ii^am",
  4249. "i^akh",
  4250. "ami",
  4251. "iei`",
  4252. "i^am",
  4253. "iem",
  4254. "akh",
  4255. "ii^u",
  4256. "'i^u",
  4257. "ii^a",
  4258. "'i^a",
  4259. "ev",
  4260. "ov",
  4261. "ie",
  4262. "'e",
  4263. "ei",
  4264. "ii",
  4265. "ei`",
  4266. "oi`",
  4267. "ii`",
  4268. "em",
  4269. "am",
  4270. "om",
  4271. "i^u",
  4272. "i^a",
  4273. "a",
  4274. "e",
  4275. "i",
  4276. "i`",
  4277. "o",
  4278. "u",
  4279. "y",
  4280. "'",
  4281. )
  4282. __superlative_suffixes = ("ei`she", "ei`sh")
  4283. __derivational_suffixes = ("ost'", "ost")
  4284. def stem(self, word):
  4285. """
  4286. Stem a Russian word and return the stemmed form.
  4287. :param word: The word that is stemmed.
  4288. :type word: str or unicode
  4289. :return: The stemmed form.
  4290. :rtype: unicode
  4291. """
  4292. if word in self.stopwords:
  4293. return word
# Return the word unchanged if it contains no characters outside
# Latin-1, i.e. nothing that could be a Cyrillic letter.
if not any(ord(ch) > 255 for ch in word):
return word
  4301. word = self.__cyrillic_to_roman(word)
  4302. step1_success = False
  4303. adjectival_removed = False
  4304. verb_removed = False
  4305. undouble_success = False
  4306. superlative_removed = False
  4307. rv, r2 = self.__regions_russian(word)
  4308. # Step 1
  4309. for suffix in self.__perfective_gerund_suffixes:
  4310. if rv.endswith(suffix):
  4311. if suffix in ("v", "vshi", "vshis'"):
  4312. if (
  4313. rv[-len(suffix) - 3 : -len(suffix)] == "i^a"
  4314. or rv[-len(suffix) - 1 : -len(suffix)] == "a"
  4315. ):
  4316. word = word[: -len(suffix)]
  4317. r2 = r2[: -len(suffix)]
  4318. rv = rv[: -len(suffix)]
  4319. step1_success = True
  4320. break
  4321. else:
  4322. word = word[: -len(suffix)]
  4323. r2 = r2[: -len(suffix)]
  4324. rv = rv[: -len(suffix)]
  4325. step1_success = True
  4326. break
  4327. if not step1_success:
  4328. for suffix in self.__reflexive_suffixes:
  4329. if rv.endswith(suffix):
  4330. word = word[: -len(suffix)]
  4331. r2 = r2[: -len(suffix)]
  4332. rv = rv[: -len(suffix)]
  4333. break
  4334. for suffix in self.__adjectival_suffixes:
  4335. if rv.endswith(suffix):
  4336. if suffix in (
  4337. "i^ushchi^ui^u",
  4338. "i^ushchi^ai^a",
  4339. "i^ushchui^u",
  4340. "i^ushchai^a",
  4341. "i^ushchoi^u",
  4342. "i^ushchei^u",
  4343. "i^ushchimi",
  4344. "i^ushchymi",
  4345. "i^ushchego",
  4346. "i^ushchogo",
  4347. "i^ushchemu",
  4348. "i^ushchomu",
  4349. "i^ushchikh",
  4350. "i^ushchykh",
  4351. "shchi^ui^u",
  4352. "shchi^ai^a",
  4353. "i^ushchee",
  4354. "i^ushchie",
  4355. "i^ushchye",
  4356. "i^ushchoe",
  4357. "i^ushchei`",
  4358. "i^ushchii`",
  4359. "i^ushchyi`",
  4360. "i^ushchoi`",
  4361. "i^ushchem",
  4362. "i^ushchim",
  4363. "i^ushchym",
  4364. "i^ushchom",
  4365. "vshi^ui^u",
  4366. "vshi^ai^a",
  4367. "shchui^u",
  4368. "shchai^a",
  4369. "shchoi^u",
  4370. "shchei^u",
  4371. "emi^ui^u",
  4372. "emi^ai^a",
  4373. "nni^ui^u",
  4374. "nni^ai^a",
  4375. "shchimi",
  4376. "shchymi",
  4377. "shchego",
  4378. "shchogo",
  4379. "shchemu",
  4380. "shchomu",
  4381. "shchikh",
  4382. "shchykh",
  4383. "vshui^u",
  4384. "vshai^a",
  4385. "vshoi^u",
  4386. "vshei^u",
  4387. "shchee",
  4388. "shchie",
  4389. "shchye",
  4390. "shchoe",
  4391. "shchei`",
  4392. "shchii`",
  4393. "shchyi`",
  4394. "shchoi`",
  4395. "shchem",
  4396. "shchim",
  4397. "shchym",
  4398. "shchom",
  4399. "vshimi",
  4400. "vshymi",
  4401. "vshego",
  4402. "vshogo",
  4403. "vshemu",
  4404. "vshomu",
  4405. "vshikh",
  4406. "vshykh",
  4407. "emui^u",
  4408. "emai^a",
  4409. "emoi^u",
  4410. "emei^u",
  4411. "nnui^u",
  4412. "nnai^a",
  4413. "nnoi^u",
  4414. "nnei^u",
  4415. "vshee",
  4416. "vshie",
  4417. "vshye",
  4418. "vshoe",
  4419. "vshei`",
  4420. "vshii`",
  4421. "vshyi`",
  4422. "vshoi`",
  4423. "vshem",
  4424. "vshim",
  4425. "vshym",
  4426. "vshom",
  4427. "emimi",
  4428. "emymi",
  4429. "emego",
  4430. "emogo",
  4431. "ememu",
  4432. "emomu",
  4433. "emikh",
  4434. "emykh",
  4435. "nnimi",
  4436. "nnymi",
  4437. "nnego",
  4438. "nnogo",
  4439. "nnemu",
  4440. "nnomu",
  4441. "nnikh",
  4442. "nnykh",
  4443. "emee",
  4444. "emie",
  4445. "emye",
  4446. "emoe",
  4447. "emei`",
  4448. "emii`",
  4449. "emyi`",
  4450. "emoi`",
  4451. "emem",
  4452. "emim",
  4453. "emym",
  4454. "emom",
  4455. "nnee",
  4456. "nnie",
  4457. "nnye",
  4458. "nnoe",
  4459. "nnei`",
  4460. "nnii`",
  4461. "nnyi`",
  4462. "nnoi`",
  4463. "nnem",
  4464. "nnim",
  4465. "nnym",
  4466. "nnom",
  4467. ):
  4468. if (
  4469. rv[-len(suffix) - 3 : -len(suffix)] == "i^a"
  4470. or rv[-len(suffix) - 1 : -len(suffix)] == "a"
  4471. ):
  4472. word = word[: -len(suffix)]
  4473. r2 = r2[: -len(suffix)]
  4474. rv = rv[: -len(suffix)]
  4475. adjectival_removed = True
  4476. break
  4477. else:
  4478. word = word[: -len(suffix)]
  4479. r2 = r2[: -len(suffix)]
  4480. rv = rv[: -len(suffix)]
  4481. adjectival_removed = True
  4482. break
  4483. if not adjectival_removed:
  4484. for suffix in self.__verb_suffixes:
  4485. if rv.endswith(suffix):
  4486. if suffix in (
  4487. "la",
  4488. "na",
  4489. "ete",
  4490. "i`te",
  4491. "li",
  4492. "i`",
  4493. "l",
  4494. "em",
  4495. "n",
  4496. "lo",
  4497. "no",
  4498. "et",
  4499. "i^ut",
  4500. "ny",
  4501. "t'",
  4502. "esh'",
  4503. "nno",
  4504. ):
  4505. if (
  4506. rv[-len(suffix) - 3 : -len(suffix)] == "i^a"
  4507. or rv[-len(suffix) - 1 : -len(suffix)] == "a"
  4508. ):
  4509. word = word[: -len(suffix)]
  4510. r2 = r2[: -len(suffix)]
  4511. rv = rv[: -len(suffix)]
  4512. verb_removed = True
  4513. break
  4514. else:
  4515. word = word[: -len(suffix)]
  4516. r2 = r2[: -len(suffix)]
  4517. rv = rv[: -len(suffix)]
  4518. verb_removed = True
  4519. break
  4520. if not adjectival_removed and not verb_removed:
  4521. for suffix in self.__noun_suffixes:
  4522. if rv.endswith(suffix):
  4523. word = word[: -len(suffix)]
  4524. r2 = r2[: -len(suffix)]
  4525. rv = rv[: -len(suffix)]
  4526. break
  4527. # Step 2
  4528. if rv.endswith("i"):
  4529. word = word[:-1]
  4530. r2 = r2[:-1]
  4531. # Step 3
  4532. for suffix in self.__derivational_suffixes:
  4533. if r2.endswith(suffix):
  4534. word = word[: -len(suffix)]
  4535. break
  4536. # Step 4
  4537. if word.endswith("nn"):
  4538. word = word[:-1]
  4539. undouble_success = True
  4540. if not undouble_success:
  4541. for suffix in self.__superlative_suffixes:
  4542. if word.endswith(suffix):
  4543. word = word[: -len(suffix)]
  4544. superlative_removed = True
  4545. break
  4546. if word.endswith("nn"):
  4547. word = word[:-1]
  4548. if not undouble_success and not superlative_removed:
  4549. if word.endswith("'"):
  4550. word = word[:-1]
  4551. word = self.__roman_to_cyrillic(word)
  4552. return word
  4553. def __regions_russian(self, word):
  4554. """
  4555. Return the regions RV and R2 which are used by the Russian stemmer.
  4556. In any word, RV is the region after the first vowel,
  4557. or the end of the word if it contains no vowel.
  4558. R2 is the region after the first non-vowel following
  4559. a vowel in R1, or the end of the word if there is no such non-vowel.
  4560. R1 is the region after the first non-vowel following a vowel,
  4561. or the end of the word if there is no such non-vowel.
  4562. :param word: The Russian word whose regions RV and R2 are determined.
  4563. :type word: str or unicode
  4564. :return: the regions RV and R2 for the respective Russian word.
  4565. :rtype: tuple
  4566. :note: This helper method is invoked by the stem method of the subclass
  4567. RussianStemmer. It is not to be invoked directly!
  4568. """
  4569. r1 = ""
  4570. r2 = ""
  4571. rv = ""
  4572. vowels = ("A", "U", "E", "a", "e", "i", "o", "u", "y")
  4573. word = word.replace("i^a", "A").replace("i^u", "U").replace("e`", "E")
  4574. for i in range(1, len(word)):
  4575. if word[i] not in vowels and word[i - 1] in vowels:
  4576. r1 = word[i + 1 :]
  4577. break
  4578. for i in range(1, len(r1)):
  4579. if r1[i] not in vowels and r1[i - 1] in vowels:
  4580. r2 = r1[i + 1 :]
  4581. break
  4582. for i in range(len(word)):
  4583. if word[i] in vowels:
  4584. rv = word[i + 1 :]
  4585. break
  4586. r2 = r2.replace("A", "i^a").replace("U", "i^u").replace("E", "e`")
  4587. rv = rv.replace("A", "i^a").replace("U", "i^u").replace("E", "e`")
  4588. return (rv, r2)
  4589. def __cyrillic_to_roman(self, word):
  4590. """
  4591. Transliterate a Russian word into the Roman alphabet.
A Russian word written in the Cyrillic alphabet is
transliterated into the Roman alphabet in order to ease
the forthcoming stemming process.
  4595. :param word: The word that is transliterated.
  4596. :type word: unicode
  4597. :return: the transliterated word.
  4598. :rtype: unicode
  4599. :note: This helper method is invoked by the stem method of the subclass
  4600. RussianStemmer. It is not to be invoked directly!
  4601. """
  4602. word = (
  4603. word.replace("\u0410", "a")
  4604. .replace("\u0430", "a")
  4605. .replace("\u0411", "b")
  4606. .replace("\u0431", "b")
  4607. .replace("\u0412", "v")
  4608. .replace("\u0432", "v")
  4609. .replace("\u0413", "g")
  4610. .replace("\u0433", "g")
  4611. .replace("\u0414", "d")
  4612. .replace("\u0434", "d")
  4613. .replace("\u0415", "e")
  4614. .replace("\u0435", "e")
  4615. .replace("\u0401", "e")
  4616. .replace("\u0451", "e")
  4617. .replace("\u0416", "zh")
  4618. .replace("\u0436", "zh")
  4619. .replace("\u0417", "z")
  4620. .replace("\u0437", "z")
  4621. .replace("\u0418", "i")
  4622. .replace("\u0438", "i")
  4623. .replace("\u0419", "i`")
  4624. .replace("\u0439", "i`")
  4625. .replace("\u041A", "k")
  4626. .replace("\u043A", "k")
  4627. .replace("\u041B", "l")
  4628. .replace("\u043B", "l")
  4629. .replace("\u041C", "m")
  4630. .replace("\u043C", "m")
  4631. .replace("\u041D", "n")
  4632. .replace("\u043D", "n")
  4633. .replace("\u041E", "o")
  4634. .replace("\u043E", "o")
  4635. .replace("\u041F", "p")
  4636. .replace("\u043F", "p")
  4637. .replace("\u0420", "r")
  4638. .replace("\u0440", "r")
  4639. .replace("\u0421", "s")
  4640. .replace("\u0441", "s")
  4641. .replace("\u0422", "t")
  4642. .replace("\u0442", "t")
  4643. .replace("\u0423", "u")
  4644. .replace("\u0443", "u")
  4645. .replace("\u0424", "f")
  4646. .replace("\u0444", "f")
  4647. .replace("\u0425", "kh")
  4648. .replace("\u0445", "kh")
  4649. .replace("\u0426", "t^s")
  4650. .replace("\u0446", "t^s")
  4651. .replace("\u0427", "ch")
  4652. .replace("\u0447", "ch")
  4653. .replace("\u0428", "sh")
  4654. .replace("\u0448", "sh")
  4655. .replace("\u0429", "shch")
  4656. .replace("\u0449", "shch")
  4657. .replace("\u042A", "''")
  4658. .replace("\u044A", "''")
  4659. .replace("\u042B", "y")
  4660. .replace("\u044B", "y")
  4661. .replace("\u042C", "'")
  4662. .replace("\u044C", "'")
  4663. .replace("\u042D", "e`")
  4664. .replace("\u044D", "e`")
  4665. .replace("\u042E", "i^u")
  4666. .replace("\u044E", "i^u")
  4667. .replace("\u042F", "i^a")
  4668. .replace("\u044F", "i^a")
  4669. )
  4670. return word
  4671. def __roman_to_cyrillic(self, word):
  4672. """
  4673. Transliterate a Russian word back into the Cyrillic alphabet.
A Russian word that was transliterated into the Roman alphabet
to ease the stemming process is transliterated back into the
Cyrillic alphabet, its original form.
  4677. :param word: The word that is transliterated.
  4678. :type word: str or unicode
  4679. :return: word, the transliterated word.
  4680. :rtype: unicode
  4681. :note: This helper method is invoked by the stem method of the subclass
  4682. RussianStemmer. It is not to be invoked directly!
  4683. """
  4684. word = (
  4685. word.replace("i^u", "\u044E")
  4686. .replace("i^a", "\u044F")
  4687. .replace("shch", "\u0449")
  4688. .replace("kh", "\u0445")
  4689. .replace("t^s", "\u0446")
  4690. .replace("ch", "\u0447")
  4691. .replace("e`", "\u044D")
  4692. .replace("i`", "\u0439")
  4693. .replace("sh", "\u0448")
  4694. .replace("k", "\u043A")
  4695. .replace("e", "\u0435")
  4696. .replace("zh", "\u0436")
  4697. .replace("a", "\u0430")
  4698. .replace("b", "\u0431")
  4699. .replace("v", "\u0432")
  4700. .replace("g", "\u0433")
  4701. .replace("d", "\u0434")
  4702. .replace("e", "\u0435")
  4703. .replace("z", "\u0437")
  4704. .replace("i", "\u0438")
  4705. .replace("l", "\u043B")
  4706. .replace("m", "\u043C")
  4707. .replace("n", "\u043D")
  4708. .replace("o", "\u043E")
  4709. .replace("p", "\u043F")
  4710. .replace("r", "\u0440")
  4711. .replace("s", "\u0441")
  4712. .replace("t", "\u0442")
  4713. .replace("u", "\u0443")
  4714. .replace("f", "\u0444")
  4715. .replace("''", "\u044A")
  4716. .replace("y", "\u044B")
  4717. .replace("'", "\u044C")
  4718. )
  4719. return word
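# Illustrative usage sketch for the Russian stemmer above (a documentation
# aid, not part of the original module; no particular stems are asserted).
def _russian_stemmer_example():
    stemmer = RussianStemmer()
    # Cyrillic input is transliterated to the Roman alphabet, stemmed, and
    # transliterated back; input without characters above Latin-1 is
    # returned unchanged.
    for word in ("книга", "красивая", "говорить"):
        print(word, "->", stemmer.stem(word))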
  4720. class SpanishStemmer(_StandardStemmer):
  4721. """
  4722. The Spanish Snowball stemmer.
  4723. :cvar __vowels: The Spanish vowels.
  4724. :type __vowels: unicode
  4725. :cvar __step0_suffixes: Suffixes to be deleted in step 0 of the algorithm.
  4726. :type __step0_suffixes: tuple
  4727. :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm.
  4728. :type __step1_suffixes: tuple
  4729. :cvar __step2a_suffixes: Suffixes to be deleted in step 2a of the algorithm.
  4730. :type __step2a_suffixes: tuple
  4731. :cvar __step2b_suffixes: Suffixes to be deleted in step 2b of the algorithm.
  4732. :type __step2b_suffixes: tuple
  4733. :cvar __step3_suffixes: Suffixes to be deleted in step 3 of the algorithm.
  4734. :type __step3_suffixes: tuple
  4735. :note: A detailed description of the Spanish
  4736. stemming algorithm can be found under
  4737. http://snowball.tartarus.org/algorithms/spanish/stemmer.html
  4738. """
  4739. __vowels = "aeiou\xE1\xE9\xED\xF3\xFA\xFC"
  4740. __step0_suffixes = (
  4741. "selas",
  4742. "selos",
  4743. "sela",
  4744. "selo",
  4745. "las",
  4746. "les",
  4747. "los",
  4748. "nos",
  4749. "me",
  4750. "se",
  4751. "la",
  4752. "le",
  4753. "lo",
  4754. )
  4755. __step1_suffixes = (
  4756. "amientos",
  4757. "imientos",
  4758. "amiento",
  4759. "imiento",
  4760. "aciones",
  4761. "uciones",
  4762. "adoras",
  4763. "adores",
  4764. "ancias",
  4765. "log\xEDas",
  4766. "encias",
  4767. "amente",
  4768. "idades",
  4769. "anzas",
  4770. "ismos",
  4771. "ables",
  4772. "ibles",
  4773. "istas",
  4774. "adora",
  4775. "aci\xF3n",
  4776. "antes",
  4777. "ancia",
  4778. "log\xEDa",
  4779. "uci\xf3n",
  4780. "encia",
  4781. "mente",
  4782. "anza",
  4783. "icos",
  4784. "icas",
  4785. "ismo",
  4786. "able",
  4787. "ible",
  4788. "ista",
  4789. "osos",
  4790. "osas",
  4791. "ador",
  4792. "ante",
  4793. "idad",
  4794. "ivas",
  4795. "ivos",
  4796. "ico",
  4797. "ica",
  4798. "oso",
  4799. "osa",
  4800. "iva",
  4801. "ivo",
  4802. )
  4803. __step2a_suffixes = (
  4804. "yeron",
  4805. "yendo",
  4806. "yamos",
  4807. "yais",
  4808. "yan",
  4809. "yen",
  4810. "yas",
  4811. "yes",
  4812. "ya",
  4813. "ye",
  4814. "yo",
  4815. "y\xF3",
  4816. )
  4817. __step2b_suffixes = (
  4818. "ar\xEDamos",
  4819. "er\xEDamos",
  4820. "ir\xEDamos",
  4821. "i\xE9ramos",
  4822. "i\xE9semos",
  4823. "ar\xEDais",
  4824. "aremos",
  4825. "er\xEDais",
  4826. "eremos",
  4827. "ir\xEDais",
  4828. "iremos",
  4829. "ierais",
  4830. "ieseis",
  4831. "asteis",
  4832. "isteis",
  4833. "\xE1bamos",
  4834. "\xE1ramos",
  4835. "\xE1semos",
  4836. "ar\xEDan",
  4837. "ar\xEDas",
  4838. "ar\xE9is",
  4839. "er\xEDan",
  4840. "er\xEDas",
  4841. "er\xE9is",
  4842. "ir\xEDan",
  4843. "ir\xEDas",
  4844. "ir\xE9is",
  4845. "ieran",
  4846. "iesen",
  4847. "ieron",
  4848. "iendo",
  4849. "ieras",
  4850. "ieses",
  4851. "abais",
  4852. "arais",
  4853. "aseis",
  4854. "\xE9amos",
  4855. "ar\xE1n",
  4856. "ar\xE1s",
  4857. "ar\xEDa",
  4858. "er\xE1n",
  4859. "er\xE1s",
  4860. "er\xEDa",
  4861. "ir\xE1n",
  4862. "ir\xE1s",
  4863. "ir\xEDa",
  4864. "iera",
  4865. "iese",
  4866. "aste",
  4867. "iste",
  4868. "aban",
  4869. "aran",
  4870. "asen",
  4871. "aron",
  4872. "ando",
  4873. "abas",
  4874. "adas",
  4875. "idas",
  4876. "aras",
  4877. "ases",
  4878. "\xEDais",
  4879. "ados",
  4880. "idos",
  4881. "amos",
  4882. "imos",
  4883. "emos",
  4884. "ar\xE1",
  4885. "ar\xE9",
  4886. "er\xE1",
  4887. "er\xE9",
  4888. "ir\xE1",
  4889. "ir\xE9",
  4890. "aba",
  4891. "ada",
  4892. "ida",
  4893. "ara",
  4894. "ase",
  4895. "\xEDan",
  4896. "ado",
  4897. "ido",
  4898. "\xEDas",
  4899. "\xE1is",
  4900. "\xE9is",
  4901. "\xEDa",
  4902. "ad",
  4903. "ed",
  4904. "id",
  4905. "an",
  4906. "i\xF3",
  4907. "ar",
  4908. "er",
  4909. "ir",
  4910. "as",
  4911. "\xEDs",
  4912. "en",
  4913. "es",
  4914. )
  4915. __step3_suffixes = ("os", "a", "e", "o", "\xE1", "\xE9", "\xED", "\xF3")
  4916. def stem(self, word):
  4917. """
  4918. Stem a Spanish word and return the stemmed form.
  4919. :param word: The word that is stemmed.
  4920. :type word: str or unicode
  4921. :return: The stemmed form.
  4922. :rtype: unicode
  4923. """
  4924. word = word.lower()
  4925. if word in self.stopwords:
  4926. return word
  4927. step1_success = False
  4928. r1, r2 = self._r1r2_standard(word, self.__vowels)
  4929. rv = self._rv_standard(word, self.__vowels)
  4930. # STEP 0: Attached pronoun
  4931. for suffix in self.__step0_suffixes:
  4932. if not (word.endswith(suffix) and rv.endswith(suffix)):
  4933. continue
  4934. if (
  4935. rv[: -len(suffix)].endswith(
  4936. (
  4937. "ando",
  4938. "\xE1ndo",
  4939. "ar",
  4940. "\xE1r",
  4941. "er",
  4942. "\xE9r",
  4943. "iendo",
  4944. "i\xE9ndo",
  4945. "ir",
  4946. "\xEDr",
  4947. )
  4948. )
  4949. ) or (
  4950. rv[: -len(suffix)].endswith("yendo")
  4951. and word[: -len(suffix)].endswith("uyendo")
  4952. ):
  4953. word = self.__replace_accented(word[: -len(suffix)])
  4954. r1 = self.__replace_accented(r1[: -len(suffix)])
  4955. r2 = self.__replace_accented(r2[: -len(suffix)])
  4956. rv = self.__replace_accented(rv[: -len(suffix)])
  4957. break
  4958. # STEP 1: Standard suffix removal
  4959. for suffix in self.__step1_suffixes:
  4960. if not word.endswith(suffix):
  4961. continue
  4962. if suffix == "amente" and r1.endswith(suffix):
  4963. step1_success = True
  4964. word = word[:-6]
  4965. r2 = r2[:-6]
  4966. rv = rv[:-6]
  4967. if r2.endswith("iv"):
  4968. word = word[:-2]
  4969. r2 = r2[:-2]
  4970. rv = rv[:-2]
  4971. if r2.endswith("at"):
  4972. word = word[:-2]
  4973. rv = rv[:-2]
  4974. elif r2.endswith(("os", "ic", "ad")):
  4975. word = word[:-2]
  4976. rv = rv[:-2]
  4977. elif r2.endswith(suffix):
  4978. step1_success = True
  4979. if suffix in (
  4980. "adora",
  4981. "ador",
  4982. "aci\xF3n",
  4983. "adoras",
  4984. "adores",
  4985. "aciones",
  4986. "ante",
  4987. "antes",
  4988. "ancia",
  4989. "ancias",
  4990. ):
  4991. word = word[: -len(suffix)]
  4992. r2 = r2[: -len(suffix)]
  4993. rv = rv[: -len(suffix)]
  4994. if r2.endswith("ic"):
  4995. word = word[:-2]
  4996. rv = rv[:-2]
  4997. elif suffix in ("log\xEDa", "log\xEDas"):
  4998. word = suffix_replace(word, suffix, "log")
  4999. rv = suffix_replace(rv, suffix, "log")
  5000. elif suffix in ("uci\xF3n", "uciones"):
  5001. word = suffix_replace(word, suffix, "u")
  5002. rv = suffix_replace(rv, suffix, "u")
  5003. elif suffix in ("encia", "encias"):
  5004. word = suffix_replace(word, suffix, "ente")
  5005. rv = suffix_replace(rv, suffix, "ente")
  5006. elif suffix == "mente":
  5007. word = word[: -len(suffix)]
  5008. r2 = r2[: -len(suffix)]
  5009. rv = rv[: -len(suffix)]
  5010. if r2.endswith(("ante", "able", "ible")):
  5011. word = word[:-4]
  5012. rv = rv[:-4]
  5013. elif suffix in ("idad", "idades"):
  5014. word = word[: -len(suffix)]
  5015. r2 = r2[: -len(suffix)]
  5016. rv = rv[: -len(suffix)]
  5017. for pre_suff in ("abil", "ic", "iv"):
  5018. if r2.endswith(pre_suff):
  5019. word = word[: -len(pre_suff)]
  5020. rv = rv[: -len(pre_suff)]
  5021. elif suffix in ("ivo", "iva", "ivos", "ivas"):
  5022. word = word[: -len(suffix)]
  5023. r2 = r2[: -len(suffix)]
  5024. rv = rv[: -len(suffix)]
  5025. if r2.endswith("at"):
  5026. word = word[:-2]
  5027. rv = rv[:-2]
  5028. else:
  5029. word = word[: -len(suffix)]
  5030. rv = rv[: -len(suffix)]
  5031. break
# STEP 2a: Verb suffixes beginning with 'y'
  5033. if not step1_success:
  5034. for suffix in self.__step2a_suffixes:
  5035. if rv.endswith(suffix) and word[-len(suffix) - 1 : -len(suffix)] == "u":
  5036. word = word[: -len(suffix)]
  5037. rv = rv[: -len(suffix)]
  5038. break
  5039. # STEP 2b: Other verb suffixes
  5040. for suffix in self.__step2b_suffixes:
  5041. if rv.endswith(suffix):
  5042. word = word[: -len(suffix)]
  5043. rv = rv[: -len(suffix)]
  5044. if suffix in ("en", "es", "\xE9is", "emos"):
  5045. if word.endswith("gu"):
  5046. word = word[:-1]
  5047. if rv.endswith("gu"):
  5048. rv = rv[:-1]
  5049. break
  5050. # STEP 3: Residual suffix
  5051. for suffix in self.__step3_suffixes:
  5052. if rv.endswith(suffix):
  5053. word = word[: -len(suffix)]
  5054. if suffix in ("e", "\xE9"):
  5055. rv = rv[: -len(suffix)]
  5056. if word[-2:] == "gu" and rv.endswith("u"):
  5057. word = word[:-1]
  5058. break
  5059. word = self.__replace_accented(word)
  5060. return word
  5061. def __replace_accented(self, word):
  5062. """
Replaces all accented letters in a word with their non-accented
counterparts.
:param word: A Spanish word, with or without accents
  5066. :type word: str or unicode
  5067. :return: a word with the accented letters (á, é, í, ó, ú) replaced with
  5068. their non-accented counterparts (a, e, i, o, u)
  5069. :rtype: str or unicode
  5070. """
  5071. return (
  5072. word.replace("\xE1", "a")
  5073. .replace("\xE9", "e")
  5074. .replace("\xED", "i")
  5075. .replace("\xF3", "o")
  5076. .replace("\xFA", "u")
  5077. )
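# Illustrative usage sketch for the Spanish stemmer above (a documentation
# aid, not part of the original module; no particular stems are asserted).
def _spanish_stemmer_example():
    stemmer = SpanishStemmer()
    # Step 0 strips attached pronouns such as 'se'/'lo' after gerund and
    # infinitive endings, and any remaining acute accents are removed at
    # the very end via __replace_accented.
    for word in ("corriendo", "rápidamente", "dándoselo"):
        print(word, "->", stemmer.stem(word))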
  5078. class SwedishStemmer(_ScandinavianStemmer):
  5079. """
  5080. The Swedish Snowball stemmer.
  5081. :cvar __vowels: The Swedish vowels.
  5082. :type __vowels: unicode
  5083. :cvar __s_ending: Letters that may directly appear before a word final 's'.
  5084. :type __s_ending: unicode
  5085. :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm.
  5086. :type __step1_suffixes: tuple
  5087. :cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm.
  5088. :type __step2_suffixes: tuple
  5089. :cvar __step3_suffixes: Suffixes to be deleted in step 3 of the algorithm.
  5090. :type __step3_suffixes: tuple
  5091. :note: A detailed description of the Swedish
  5092. stemming algorithm can be found under
  5093. http://snowball.tartarus.org/algorithms/swedish/stemmer.html
  5094. """
  5095. __vowels = "aeiouy\xE4\xE5\xF6"
  5096. __s_ending = "bcdfghjklmnoprtvy"
  5097. __step1_suffixes = (
  5098. "heterna",
  5099. "hetens",
  5100. "heter",
  5101. "heten",
  5102. "anden",
  5103. "arnas",
  5104. "ernas",
  5105. "ornas",
  5106. "andes",
  5107. "andet",
  5108. "arens",
  5109. "arna",
  5110. "erna",
  5111. "orna",
  5112. "ande",
  5113. "arne",
  5114. "aste",
  5115. "aren",
  5116. "ades",
  5117. "erns",
  5118. "ade",
  5119. "are",
  5120. "ern",
  5121. "ens",
  5122. "het",
  5123. "ast",
  5124. "ad",
  5125. "en",
  5126. "ar",
  5127. "er",
  5128. "or",
  5129. "as",
  5130. "es",
  5131. "at",
  5132. "a",
  5133. "e",
  5134. "s",
  5135. )
  5136. __step2_suffixes = ("dd", "gd", "nn", "dt", "gt", "kt", "tt")
  5137. __step3_suffixes = ("fullt", "l\xF6st", "els", "lig", "ig")
    def stem(self, word):
        """
        Stem a Swedish word and return the stemmed form.

        :param word: The word that is stemmed.
        :type word: str or unicode
        :return: The stemmed form.
        :rtype: unicode
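
        A doctest-style sketch (the stem shown assumes the standard Snowball
        Swedish algorithm):

            >>> SwedishStemmer().stem("klokheten")
            'klok'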
  5145. """
  5146. word = word.lower()
  5147. if word in self.stopwords:
  5148. return word
  5149. r1 = self._r1_scandinavian(word, self.__vowels)
  5150. # STEP 1
  5151. for suffix in self.__step1_suffixes:
  5152. if r1.endswith(suffix):
  5153. if suffix == "s":
  5154. if word[-2] in self.__s_ending:
  5155. word = word[:-1]
  5156. r1 = r1[:-1]
  5157. else:
  5158. word = word[: -len(suffix)]
  5159. r1 = r1[: -len(suffix)]
  5160. break
  5161. # STEP 2
  5162. for suffix in self.__step2_suffixes:
  5163. if r1.endswith(suffix):
  5164. word = word[:-1]
  5165. r1 = r1[:-1]
  5166. break
  5167. # STEP 3
  5168. for suffix in self.__step3_suffixes:
  5169. if r1.endswith(suffix):
  5170. if suffix in ("els", "lig", "ig"):
  5171. word = word[: -len(suffix)]
  5172. elif suffix in ("fullt", "l\xF6st"):
  5173. word = word[:-1]
  5174. break
  5175. return word


def demo():
    """
    This function provides a demonstration of the Snowball stemmers.

    After invoking this function and specifying a language,
    it stems an excerpt of the Universal Declaration of Human Rights
    (which is a part of the NLTK corpus collection) and then prints
    out the original and the stemmed text.

    A small, non-interactive usage sketch follows this function.
    """
    from nltk.corpus import udhr

    udhr_corpus = {
        "arabic": "Arabic_Alarabia-Arabic",
        "danish": "Danish_Dansk-Latin1",
        "dutch": "Dutch_Nederlands-Latin1",
        "english": "English-Latin1",
        "finnish": "Finnish_Suomi-Latin1",
        "french": "French_Francais-Latin1",
        "german": "German_Deutsch-Latin1",
        "hungarian": "Hungarian_Magyar-UTF8",
        "italian": "Italian_Italiano-Latin1",
        "norwegian": "Norwegian-Latin1",
        "porter": "English-Latin1",
        "portuguese": "Portuguese_Portugues-Latin1",
        "romanian": "Romanian_Romana-Latin2",
        "russian": "Russian-UTF8",
        "spanish": "Spanish-Latin1",
        "swedish": "Swedish_Svenska-Latin1",
    }

    print("\n")
    print("******************************")
    print("Demo for the Snowball stemmers")
    print("******************************")

    while True:

        language = input(
            "Please enter the name of the language "
            + "to be demonstrated\n"
            + "/".join(SnowballStemmer.languages)
            + "\n"
            + "(enter 'exit' in order to leave): "
        )

        if language == "exit":
            break

        if language not in SnowballStemmer.languages:
            print(
                "\nOops, there is no stemmer for this language. "
                "Please try again.\n"
            )
            continue

        stemmer = SnowballStemmer(language)
        excerpt = udhr.words(udhr_corpus[language])[:300]

        stemmed = " ".join(stemmer.stem(word) for word in excerpt)
        stemmed = re.sub(r"(.{,70})\s", r"\1\n", stemmed + " ").rstrip()
        excerpt = " ".join(excerpt)
        excerpt = re.sub(r"(.{,70})\s", r"\1\n", excerpt + " ").rstrip()

        print("\n")
        print("-" * 70)
        print("ORIGINAL".center(70))
        print(excerpt)
        print("\n\n")
        print("STEMMED RESULTS".center(70))
        print(stemmed)
        print("-" * 70)
        print("\n")