| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702
70370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127512761
27712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567156815691570157115721573157415751576157715781579158015811582158315841585158615871588158915901591159215931594159515961597159815991600160116021603160416051606160716081609161016111612161316141615161616171618161916201621162216231624162516261627162816291630163116321633163416351636163716381639164016411642164316441645164616471648164916501651165216531654165516561657165816591660166116621663166416651666166716681669167016711672167316741675167616771678167916801681168216831684168516861687168816891690169116921693169416951696169716981699170017011702170317041705170617071708170917101711171217131714171517161717171817191720172117221723172417251726172717281729173017311732173317341735173617371738173917401741174217431744174517461747174817491750175117521753175417551756175717581759176017611762176317641765176617671768176917701771177217731774177517761
77717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905190619071908190919101911191219131914191519161917191819191920192119221923192419251926192719281929193019311932193319341935193619371938193919401941194219431944194519461947194819491950195119521953195419551956195719581959196019611962196319641965196619671968196919701971197219731974197519761977197819791980198119821983198419851986198719881989199019911992199319941995199619971998199920002001200220032004200520062007200820092010201120122013201420152016201720182019202020212022202320242025202620272028202920302031203220332034203520362037203820392040204120422043204420452046204720482049205020512052205320542055205620572058205920602061206220632064206520662067206820692070207120722073207420752076207720782079208020812082208320842085208620872088208920902091209220932094209520962097209820992100210121022103210421052106210721082109211021112112211321142115211621172118211921202121212221232124212521262127212821292130213121322133213421352136213721382139214021412142214321442145214621472148214921502151215221532154215521562157215821592160216121622163216421652166216721682169217021712172217321742175217621772178217921802181218221832184218521862187218821892190219121922193219421952196219721982199220022012202220322042205220622072208220922102211221222132214221522162217221822192220222122222223222422252226222722282229223022312232223322342235223622372238223922402241224222432244224522462247224822492250225122522253225422552256225722582259226022612262226322642265226622672268226922702271227222732274227522762
27722782279228022812282228322842285228622872288228922902291229222932294229522962297229822992300230123022303230423052306230723082309231023112312231323142315231623172318231923202321232223232324232523262327232823292330233123322333233423352336233723382339234023412342234323442345234623472348234923502351235223532354235523562357235823592360236123622363236423652366236723682369237023712372237323742375237623772378237923802381238223832384238523862387238823892390239123922393239423952396239723982399240024012402240324042405240624072408240924102411241224132414241524162417241824192420242124222423242424252426242724282429243024312432243324342435243624372438243924402441244224432444244524462447244824492450245124522453245424552456245724582459246024612462246324642465246624672468246924702471247224732474247524762477247824792480248124822483248424852486248724882489249024912492249324942495249624972498249925002501250225032504250525062507250825092510251125122513251425152516251725182519252025212522252325242525252625272528252925302531253225332534253525362537253825392540254125422543254425452546254725482549255025512552255325542555255625572558255925602561256225632564256525662567256825692570257125722573257425752576257725782579258025812582258325842585258625872588258925902591259225932594259525962597259825992600260126022603260426052606260726082609261026112612261326142615261626172618261926202621262226232624262526262627262826292630263126322633263426352636263726382639264026412642264326442645264626472648264926502651265226532654265526562657265826592660266126622663266426652666266726682669267026712672267326742675267626772678267926802681268226832684268526862687268826892690269126922693269426952696269726982699270027012702270327042705270627072708270927102711271227132714271527162717271827192720272127222723272427252726272727282729273027312732273327342735273627372738273927402741274227432744274527462747274827492750275127522753275427552756275727582759276027612762276327642765276627672768276927702771277227732774277527762
77727782779278027812782278327842785278627872788278927902791279227932794279527962797279827992800280128022803280428052806280728082809281028112812281328142815281628172818281928202821282228232824282528262827282828292830283128322833283428352836283728382839284028412842284328442845284628472848284928502851285228532854285528562857285828592860286128622863286428652866286728682869287028712872287328742875287628772878287928802881288228832884288528862887288828892890289128922893289428952896289728982899290029012902290329042905290629072908290929102911291229132914291529162917291829192920292129222923292429252926292729282929293029312932293329342935293629372938293929402941294229432944294529462947294829492950295129522953295429552956295729582959296029612962296329642965296629672968296929702971297229732974297529762977297829792980298129822983298429852986298729882989299029912992299329942995299629972998299930003001300230033004300530063007300830093010301130123013301430153016301730183019302030213022302330243025302630273028302930303031303230333034303530363037303830393040304130423043304430453046304730483049305030513052305330543055305630573058305930603061306230633064306530663067306830693070307130723073307430753076307730783079308030813082308330843085308630873088308930903091309230933094309530963097309830993100310131023103310431053106310731083109311031113112311331143115311631173118311931203121312231233124312531263127312831293130313131323133313431353136313731383139314031413142314331443145314631473148314931503151315231533154315531563157315831593160316131623163316431653166316731683169317031713172317331743175317631773178317931803181318231833184318531863187318831893190319131923193319431953196319731983199320032013202320332043205320632073208320932103211321232133214321532163217321832193220322132223223322432253226322732283229323032313232323332343235323632373238323932403241324232433244324532463247324832493250325132523253325432553256325732583259326032613262326332643265326632673268326932703271327232733274327532763
27732783279328032813282328332843285328632873288328932903291329232933294329532963297329832993300330133023303330433053306330733083309331033113312331333143315331633173318331933203321332233233324332533263327332833293330333133323333333433353336333733383339334033413342334333443345334633473348334933503351335233533354335533563357335833593360336133623363336433653366336733683369337033713372337333743375337633773378337933803381338233833384338533863387338833893390339133923393339433953396339733983399340034013402340334043405340634073408340934103411341234133414341534163417341834193420342134223423342434253426342734283429343034313432343334343435343634373438343934403441344234433444344534463447344834493450345134523453345434553456345734583459346034613462346334643465346634673468346934703471347234733474347534763477347834793480348134823483348434853486348734883489349034913492349334943495349634973498349935003501350235033504350535063507350835093510351135123513351435153516351735183519352035213522352335243525352635273528352935303531353235333534353535363537353835393540354135423543354435453546354735483549355035513552355335543555355635573558355935603561356235633564356535663567356835693570357135723573357435753576357735783579358035813582358335843585358635873588358935903591359235933594359535963597359835993600360136023603360436053606360736083609361036113612361336143615361636173618361936203621362236233624362536263627362836293630363136323633363436353636363736383639364036413642364336443645364636473648364936503651365236533654365536563657365836593660366136623663366436653666366736683669367036713672367336743675367636773678367936803681368236833684368536863687368836893690369136923693369436953696369736983699370037013702370337043705370637073708370937103711371237133714371537163717371837193720372137223723372437253726372737283729373037313732373337343735373637373738373937403741374237433744374537463747374837493750375137523753375437553756375737583759376037613762376337643765376637673768376937703771377237733774377537763
77737783779378037813782378337843785378637873788378937903791379237933794379537963797379837993800380138023803380438053806380738083809381038113812381338143815381638173818381938203821382238233824382538263827382838293830383138323833383438353836383738383839384038413842384338443845384638473848384938503851385238533854385538563857385838593860386138623863386438653866386738683869387038713872387338743875387638773878387938803881388238833884388538863887388838893890389138923893389438953896389738983899390039013902390339043905390639073908390939103911391239133914391539163917391839193920392139223923392439253926392739283929393039313932393339343935393639373938393939403941394239433944394539463947394839493950395139523953395439553956395739583959396039613962396339643965396639673968396939703971397239733974397539763977397839793980398139823983398439853986398739883989399039913992399339943995399639973998399940004001400240034004400540064007400840094010401140124013401440154016401740184019402040214022402340244025402640274028402940304031403240334034403540364037403840394040404140424043404440454046404740484049405040514052405340544055405640574058405940604061406240634064406540664067406840694070407140724073407440754076407740784079408040814082408340844085408640874088408940904091409240934094409540964097409840994100410141024103410441054106410741084109411041114112411341144115411641174118411941204121412241234124412541264127412841294130413141324133413441354136413741384139414041414142414341444145414641474148414941504151415241534154415541564157415841594160416141624163416441654166416741684169417041714172417341744175417641774178417941804181418241834184418541864187418841894190419141924193419441954196419741984199420042014202420342044205420642074208420942104211421242134214421542164217421842194220422142224223422442254226422742284229423042314232423342344235423642374238423942404241424242434244424542464247424842494250425142524253425442554256425742584259426042614262426342644265426642674268426942704271427242734274427542764
27742784279428042814282428342844285428642874288428942904291429242934294429542964297429842994300430143024303430443054306430743084309431043114312431343144315431643174318431943204321432243234324432543264327432843294330433143324333433443354336433743384339434043414342434343444345434643474348434943504351435243534354435543564357435843594360436143624363436443654366436743684369437043714372437343744375437643774378437943804381438243834384438543864387438843894390439143924393439443954396439743984399440044014402440344044405440644074408440944104411441244134414441544164417441844194420442144224423442444254426442744284429443044314432443344344435443644374438443944404441444244434444444544464447444844494450445144524453445444554456445744584459446044614462446344644465446644674468446944704471447244734474447544764477447844794480448144824483448444854486448744884489449044914492449344944495449644974498449945004501450245034504450545064507450845094510451145124513451445154516451745184519452045214522452345244525452645274528452945304531453245334534453545364537453845394540454145424543454445454546454745484549455045514552455345544555455645574558455945604561456245634564456545664567456845694570457145724573457445754576457745784579458045814582458345844585458645874588458945904591459245934594459545964597459845994600460146024603460446054606460746084609461046114612461346144615461646174618461946204621462246234624462546264627462846294630463146324633463446354636463746384639464046414642464346444645464646474648464946504651465246534654465546564657465846594660466146624663466446654666466746684669467046714672467346744675467646774678467946804681468246834684468546864687468846894690469146924693469446954696469746984699470047014702470347044705470647074708470947104711471247134714471547164717471847194720472147224723472447254726472747284729473047314732473347344735473647374738473947404741474247434744474547464747474847494750475147524753475447554756475747584759476047614762476347644765476647674768476947704771477247734774477547764
77747784779478047814782478347844785478647874788478947904791479247934794479547964797479847994800480148024803480448054806480748084809481048114812481348144815481648174818481948204821482248234824482548264827482848294830483148324833483448354836483748384839484048414842484348444845484648474848484948504851485248534854485548564857485848594860486148624863486448654866486748684869487048714872487348744875487648774878487948804881488248834884488548864887488848894890489148924893489448954896489748984899490049014902490349044905490649074908490949104911491249134914491549164917491849194920492149224923492449254926492749284929493049314932493349344935493649374938493949404941494249434944494549464947494849494950495149524953495449554956495749584959496049614962496349644965496649674968496949704971497249734974497549764977497849794980498149824983498449854986498749884989499049914992499349944995499649974998499950005001500250035004500550065007500850095010501150125013501450155016501750185019502050215022502350245025502650275028502950305031503250335034503550365037503850395040504150425043504450455046504750485049505050515052505350545055505650575058505950605061506250635064506550665067506850695070507150725073507450755076507750785079508050815082508350845085508650875088508950905091509250935094509550965097509850995100510151025103510451055106510751085109511051115112511351145115511651175118511951205121512251235124512551265127512851295130513151325133513451355136513751385139514051415142514351445145514651475148514951505151515251535154515551565157515851595160516151625163516451655166516751685169517051715172517351745175517651775178517951805181518251835184518551865187518851895190519151925193519451955196519751985199520052015202520352045205520652075208520952105211521252135214521552165217521852195220522152225223522452255226522752285229523052315232523352345235523652375238523952405241524252435244524552465247524852495250525152525253525452555256525752585259526052615262526352645265526652675268526952705271527252735274527552765
27752785279528052815282528352845285528652875288528952905291529252935294529552965297529852995300530153025303530453055306530753085309531053115312531353145315531653175318531953205321532253235324532553265327532853295330533153325333533453355336533753385339534053415342534353445345534653475348534953505351535253535354535553565357535853595360536153625363536453655366536753685369537053715372537353745375537653775378537953805381538253835384538553865387538853895390539153925393539453955396539753985399540054015402540354045405540654075408540954105411541254135414541554165417541854195420542154225423542454255426542754285429543054315432543354345435543654375438543954405441544254435444544554465447544854495450545154525453545454555456545754585459546054615462546354645465546654675468546954705471547254735474547554765477547854795480548154825483548454855486548754885489549054915492549354945495549654975498549955005501550255035504550555065507550855095510551155125513551455155516551755185519552055215522552355245525552655275528552955305531553255335534553555365537553855395540554155425543554455455546554755485549555055515552555355545555555655575558555955605561556255635564556555665567556855695570557155725573557455755576557755785579558055815582558355845585558655875588558955905591559255935594559555965597559855995600560156025603560456055606560756085609561056115612561356145615561656175618561956205621562256235624562556265627562856295630563156325633563456355636563756385639564056415642564356445645564656475648564956505651565256535654565556565657565856595660566156625663566456655666566756685669567056715672567356745675567656775678567956805681568256835684568556865687568856895690569156925693569456955696569756985699570057015702570357045705570657075708570957105711571257135714571557165717571857195720572157225723572457255726572757285729573057315732573357345735573657375738573957405741574257435744574557465747574857495750575157525753575457555756575757585759576057615762576357645765576657675768576957705771577257735774577557765
777577857795780578157825783578457855786578757885789579057915792579357945795579657975798579958005801580258035804580558065807580858095810581158125813581458155816581758185819582058215822582358245825582658275828582958305831583258335834583558365837583858395840584158425843584458455846584758485849585058515852585358545855585658575858585958605861586258635864586558665867586858695870587158725873587458755876587758785879588058815882588358845885588658875888588958905891589258935894589558965897589858995900590159025903590459055906590759085909591059115912591359145915591659175918591959205921592259235924592559265927592859295930593159325933593459355936593759385939594059415942 |
- # -*- coding: utf-8 -*-
- #
- # Natural Language Toolkit: Snowball Stemmer
- #
- # Copyright (C) 2001-2020 NLTK Project
- # Author: Peter Michael Stahl <pemistahl@gmail.com>
- # Peter Ljunglof <peter.ljunglof@heatherleaf.se> (revisions)
- # Lakhdar Benzahia <lakhdar.benzahia@gmail.com> (co-writer)
- # Assem Chelli <assem.ch@gmail.com> (reviewer arabicstemmer)
- # Abdelkrim Aries <ab_aries@esi.dz> (reviewer arabicstemmer)
- # Algorithms: Dr Martin Porter <martin@tartarus.org>
- # Assem Chelli <assem.ch@gmail.com> arabic stemming algorithm
- # Benzahia Lakhdar <lakhdar.benzahia@gmail.com>
- # URL: <http://nltk.org/>
- # For license information, see LICENSE.TXT
- """
- Snowball stemmers
- This module provides a port of the Snowball stemmers
- developed by Martin Porter.
- There is also a demo function: `snowball.demo()`.
- """
- import re
- from nltk.corpus import stopwords
- from nltk.stem import porter
- from nltk.stem.util import suffix_replace, prefix_replace
- from nltk.stem.api import StemmerI
class SnowballStemmer(StemmerI):
    """
    Snowball Stemmer

    Front end that selects a language-specific Snowball stemmer by name
    and delegates all stemming to it.

    The following languages are supported:
    Arabic, Danish, Dutch, English, Finnish, French, German,
    Hungarian, Italian, Norwegian, Portuguese, Romanian, Russian,
    Spanish and Swedish. The name "porter" additionally selects the
    ``PorterStemmer`` class of this module.

    The algorithm for English is documented here:

        Porter, M. "An algorithm for suffix stripping."
        Program 14.3 (1980): 130-137.

    The algorithms have been developed by Martin Porter.
    These stemmers are called Snowball, because Porter created
    a programming language with this name for creating
    new stemming algorithms. There is more information available
    at http://snowball.tartarus.org/

    The stemmer is invoked as shown below:

    >>> from nltk.stem import SnowballStemmer
    >>> print(" ".join(SnowballStemmer.languages)) # See which languages are supported
    arabic danish dutch english finnish french german hungarian
    italian norwegian porter portuguese romanian russian
    spanish swedish
    >>> stemmer = SnowballStemmer("german") # Choose a language
    >>> stemmer.stem("Autobahnen") # Stem a word
    'autobahn'

    Invoking the stemmers that way is useful if you do not know the
    language to be stemmed at runtime. Alternatively, if you already know
    the language, then you can invoke the language specific stemmer directly:

    >>> from nltk.stem.snowball import GermanStemmer
    >>> stemmer = GermanStemmer()
    >>> stemmer.stem("Autobahnen")
    'autobahn'

    :param language: The language whose subclass is instantiated.
    :type language: str or unicode
    :param ignore_stopwords: If set to True, stopwords are
                             not stemmed and returned unchanged.
                             Set to False by default.
    :type ignore_stopwords: bool
    :raise ValueError: If there is no stemmer for the specified
                       language, a ValueError is raised.
    """

    # Names accepted by the constructor. Each one maps to a class called
    # "<Name>Stemmer" that is looked up in this module's globals.
    languages = (
        "arabic",
        "danish",
        "dutch",
        "english",
        "finnish",
        "french",
        "german",
        "hungarian",
        "italian",
        "norwegian",
        "porter",
        "portuguese",
        "romanian",
        "russian",
        "spanish",
        "swedish",
    )

    def __init__(self, language, ignore_stopwords=False):
        if language not in self.languages:
            raise ValueError("The language '{0}' is not supported.".format(language))
        # "german" -> "GermanStemmer", resolved in this module's namespace.
        stemmerclass = globals()[language.capitalize() + "Stemmer"]
        self.stemmer = stemmerclass(ignore_stopwords)
        # Bind the delegate's method directly on the instance so that
        # ``obj.stem(token)`` skips one level of indirection.
        self.stem = self.stemmer.stem
        self.stopwords = self.stemmer.stopwords

    def stem(self, token):
        """
        Stem ``token`` with the language-specific stemmer chosen at
        construction time.

        On instances this method is normally shadowed by the ``stem``
        attribute assigned in ``__init__``; it is reached when called
        through the class, e.g. ``SnowballStemmer.stem(obj, token)``.

        :param token: The word to be stemmed.
        :return: Whatever the wrapped stemmer's ``stem`` returns.
        """
        # ``self.stemmer.stem`` is already bound to the delegate, so only
        # the token is passed on.
        return self.stemmer.stem(token)
class _LanguageSpecificStemmer(StemmerI):
    """
    Common base of the per-language stemmers, which lets a specific
    stemmer be instantiated directly.

    This is useful if you already know the language to be stemmed at runtime.

    Create an instance of the Snowball stemmer.

    :param ignore_stopwords: If set to True, stopwords are
                             not stemmed and returned unchanged.
                             Set to False by default.
    :type ignore_stopwords: bool
    """

    def __init__(self, ignore_stopwords=False):
        # The language is the lower-cased class name without its trailing
        # "stemmer" (7 characters), e.g. "GermanStemmer" -> "german".
        class_name = type(self).__name__.lower()
        language = class_name[:-7] if class_name.endswith("stemmer") else class_name

        self.stopwords = set()
        if not ignore_stopwords:
            return

        try:
            self.stopwords.update(stopwords.words(language))
        except IOError:
            # The stopwords corpus has no word list for this language.
            raise ValueError(
                "{!r} has no list of stopwords. Please set"
                " 'ignore_stopwords' to 'False'.".format(self)
            )

    def __repr__(self):
        """
        Return the class name wrapped in angle brackets, e.g. ``<GermanStemmer>``.
        """
        class_name = type(self).__name__
        return "<{0}>".format(class_name)
class PorterStemmer(_LanguageSpecificStemmer, porter.PorterStemmer):
    """
    A word stemmer based on the original Porter stemming algorithm.

        Porter, M. "An algorithm for suffix stripping."
        Program 14.3 (1980): 130-137.

    A few minor modifications have been made to Porter's basic
    algorithm.  See the source code of the module
    nltk.stem.porter for more information.

    :param ignore_stopwords: Passed on to ``_LanguageSpecificStemmer``.
                             Set to False by default.
    :type ignore_stopwords: bool
    """

    def __init__(self, ignore_stopwords=False):
        # Both bases are initialised explicitly, in this order: first the
        # Snowball-side initialiser, then the Porter implementation's own.
        _LanguageSpecificStemmer.__init__(self, ignore_stopwords=ignore_stopwords)
        porter.PorterStemmer.__init__(self)
class _ScandinavianStemmer(_LanguageSpecificStemmer):
    """
    This subclass encapsulates a method for defining the string region R1.
    It is used by the Danish, Norwegian, and Swedish stemmer.
    """

    def _r1_scandinavian(self, word, vowels):
        """
        Return the region R1 that is used by the Scandinavian stemmers.

        R1 is the region after the first non-vowel following a vowel,
        or is the null region at the end of the word if there is no
        such non-vowel. But then R1 is adjusted so that the region
        before it contains at least three letters.

        :param word: The word whose region R1 is determined.
        :type word: str or unicode
        :param vowels: The vowels of the respective language that are
                       used to determine the region R1.
        :type vowels: unicode
        :return: the region R1 for the respective word.
        :rtype: unicode
        :note: This helper method is invoked by the respective stem method of
               the subclasses DanishStemmer, NorwegianStemmer, and
               SwedishStemmer. It is not to be invoked directly!
        """
        for i in range(1, len(word)):
            if word[i] not in vowels and word[i - 1] in vowels:
                # R1 nominally starts right after this non-vowel (index
                # i + 1), but it may not start before index 3 so that at
                # least three letters precede it.
                return word[max(i + 1, 3):]
        # No vowel/non-vowel pair found: R1 is the empty region.
        return ""
class _StandardStemmer(_LanguageSpecificStemmer):
    """
    Helper base class bundling the two methods that compute the standard
    versions of the string regions R1, R2, and RV.
    """

    def _r1r2_standard(self, word, vowels):
        """
        Compute the standard regions R1 and R2 of a word.

        R1 is the region after the first non-vowel following a vowel,
        or is the null region at the end of the word if there is no
        such non-vowel.

        R2 is the region after the first non-vowel following a vowel
        in R1, or is the null region at the end of the word if there
        is no such non-vowel.

        :param word: The word whose regions R1 and R2 are determined.
        :type word: str or unicode
        :param vowels: The vowels of the respective language that are
                       used to determine the regions R1 and R2.
        :type vowels: unicode
        :return: (r1,r2), the regions R1 and R2 for the respective word.
        :rtype: tuple
        :note: This helper method is invoked by the respective stem method of
               the subclasses DutchStemmer, FinnishStemmer,
               FrenchStemmer, GermanStemmer, ItalianStemmer,
               PortugueseStemmer, RomanianStemmer, and SpanishStemmer.
               It is not to be invoked directly!
        :note: A detailed description of how to define R1 and R2
               can be found at http://snowball.tartarus.org/texts/r1r2.html
        """

        def region_after_first_vowel_consonant(text):
            # Walk over adjacent letter pairs; ``pos`` is the index of the
            # second letter of each pair.
            for pos, (previous, current) in enumerate(zip(text, text[1:]), start=1):
                if previous in vowels and current not in vowels:
                    return text[pos + 1:]
            return ""

        region1 = region_after_first_vowel_consonant(word)
        # R2 is obtained by applying the same rule to R1 itself.
        region2 = region_after_first_vowel_consonant(region1)
        return (region1, region2)

    def _rv_standard(self, word, vowels):
        """
        Compute the standard region RV of a word.

        If the second letter is a consonant, RV is the region after the
        next following vowel. If the first two letters are vowels, RV is
        the region after the next following consonant. Otherwise, RV is
        the region after the third letter.

        :param word: The word whose region RV is determined.
        :type word: str or unicode
        :param vowels: The vowels of the respective language that are
                       used to determine the region RV.
        :type vowels: unicode
        :return: the region RV for the respective word.
        :rtype: unicode
        :note: This helper method is invoked by the respective stem method of
               the subclasses ItalianStemmer, PortugueseStemmer,
               RomanianStemmer, and SpanishStemmer. It is not to be
               invoked directly!
        """
        # Words shorter than two letters have an empty RV.
        if len(word) < 2:
            return ""

        if word[1] not in vowels:
            # Second letter is a consonant: RV follows the next vowel.
            for pos, letter in enumerate(word[2:], start=2):
                if letter in vowels:
                    return word[pos + 1:]
            return ""

        if word[0] in vowels:
            # First two letters are vowels: RV follows the next consonant.
            for pos, letter in enumerate(word[2:], start=2):
                if letter not in vowels:
                    return word[pos + 1:]
            return ""

        # Consonant followed by a vowel: RV follows the third letter.
        return word[3:]
- class ArabicStemmer(_StandardStemmer):
- """
- https://github.com/snowballstem/snowball/blob/master/algorithms/arabic/stem_Unicode.sbl (Original Algorithm)
- The Snowball Arabic light Stemmer
- Algorithm : Assem Chelli
- Abdelkrim Aries
- Lakhdar Benzahia
- Nltk Version Author : Lakhdar Benzahia
- """
- # Normalize_pre steps
- __vocalization = re.compile(
- r"[\u064b-\u064c-\u064d-\u064e-\u064f-\u0650-\u0651-\u0652]"
- ) # ً، ٌ، ٍ، َ، ُ، ِ، ّ، ْ
- __kasheeda = re.compile(r"[\u0640]") # ـ tatweel/kasheeda
- __arabic_punctuation_marks = re.compile(r"[\u060C-\u061B-\u061F]") # ؛ ، ؟
- # Normalize_post
- __last_hamzat = ("\u0623", "\u0625", "\u0622", "\u0624", "\u0626") # أ، إ، آ، ؤ، ئ
- # normalize other hamza's
- __initial_hamzat = re.compile(r"^[\u0622\u0623\u0625]") # أ، إ، آ
- __waw_hamza = re.compile(r"[\u0624]") # ؤ
- __yeh_hamza = re.compile(r"[\u0626]") # ئ
- __alefat = re.compile(r"[\u0623\u0622\u0625]") # أ، إ، آ
- # Checks
- __checks1 = (
- "\u0643\u0627\u0644",
- "\u0628\u0627\u0644", # بال، كال
- "\u0627\u0644",
- "\u0644\u0644", # لل، ال
- )
- __checks2 = ("\u0629", "\u0627\u062a") # ة # female plural ات
- # Suffixes
- __suffix_noun_step1a = (
- "\u064a",
- "\u0643",
- "\u0647", # ي، ك، ه
- "\u0646\u0627",
- "\u0643\u0645",
- "\u0647\u0627",
- "\u0647\u0646",
- "\u0647\u0645", # نا، كم، ها، هن، هم
- "\u0643\u0645\u0627",
- "\u0647\u0645\u0627", # كما، هما
- )
- __suffix_noun_step1b = "\u0646" # ن
- __suffix_noun_step2a = ("\u0627", "\u064a", "\u0648") # ا، ي، و
- __suffix_noun_step2b = "\u0627\u062a" # ات
- __suffix_noun_step2c1 = "\u062a" # ت
- __suffix_noun_step2c2 = "\u0629" # ة
- __suffix_noun_step3 = "\u064a" # ي
- __suffix_verb_step1 = (
- "\u0647",
- "\u0643", # ه، ك
- "\u0646\u064a",
- "\u0646\u0627",
- "\u0647\u0627",
- "\u0647\u0645", # ني، نا، ها، هم
- "\u0647\u0646",
- "\u0643\u0645",
- "\u0643\u0646", # هن، كم، كن
- "\u0647\u0645\u0627",
- "\u0643\u0645\u0627",
- "\u0643\u0645\u0648", # هما، كما، كمو
- )
- __suffix_verb_step2a = (
- "\u062a",
- "\u0627",
- "\u0646",
- "\u064a", # ت، ا، ن، ي
- "\u0646\u0627",
- "\u062a\u0627",
- "\u062a\u0646", # نا، تا، تن Past
- "\u0627\u0646",
- "\u0648\u0646",
- "\u064a\u0646", # ان، هن، ين Present
- "\u062a\u0645\u0627", # تما
- )
- __suffix_verb_step2b = ("\u0648\u0627", "\u062a\u0645") # وا، تم
- __suffix_verb_step2c = ("\u0648", "\u062a\u0645\u0648") # و # تمو
- __suffix_all_alef_maqsura = "\u0649" # ى
- # Prefixes
- __prefix_step1 = (
- "\u0623", # أ
- "\u0623\u0623",
- "\u0623\u0622",
- "\u0623\u0624",
- "\u0623\u0627",
- "\u0623\u0625", # أأ، أآ، أؤ، أا، أإ
- )
- __prefix_step2a = ("\u0641\u0627\u0644", "\u0648\u0627\u0644") # فال، وال
- __prefix_step2b = ("\u0641", "\u0648") # ف، و
- __prefix_step3a_noun = (
- "\u0627\u0644",
- "\u0644\u0644", # لل، ال
- "\u0643\u0627\u0644",
- "\u0628\u0627\u0644", # بال، كال
- )
- __prefix_step3b_noun = (
- "\u0628",
- "\u0643",
- "\u0644", # ب، ك، ل
- "\u0628\u0628",
- "\u0643\u0643", # بب، كك
- )
- __prefix_step3_verb = (
- "\u0633\u064a",
- "\u0633\u062a",
- "\u0633\u0646",
- "\u0633\u0623",
- ) # سي، ست، سن، سأ
- __prefix_step4_verb = (
- "\u064a\u0633\u062a",
- "\u0646\u0633\u062a",
- "\u062a\u0633\u062a",
- ) # يست، نست، تست
- # Suffixes added due to Conjugation Verbs
- __conjugation_suffix_verb_1 = ("\u0647", "\u0643") # ه، ك
- __conjugation_suffix_verb_2 = (
- "\u0646\u064a",
- "\u0646\u0627",
- "\u0647\u0627", # ني، نا، ها
- "\u0647\u0645",
- "\u0647\u0646",
- "\u0643\u0645", # هم، هن، كم
- "\u0643\u0646", # كن
- )
- __conjugation_suffix_verb_3 = (
- "\u0647\u0645\u0627",
- "\u0643\u0645\u0627",
- "\u0643\u0645\u0648",
- ) # هما، كما، كمو
- __conjugation_suffix_verb_4 = ("\u0627", "\u0646", "\u064a") # ا، ن، ي
- __conjugation_suffix_verb_past = (
- "\u0646\u0627",
- "\u062a\u0627",
- "\u062a\u0646",
- ) # نا، تا، تن
- __conjugation_suffix_verb_present = (
- "\u0627\u0646",
- "\u0648\u0646",
- "\u064a\u0646",
- ) # ان، ون، ين
- # Suffixes added due to derivation Names
- __conjugation_suffix_noun_1 = ("\u064a", "\u0643", "\u0647") # ي، ك، ه
- __conjugation_suffix_noun_2 = (
- "\u0646\u0627",
- "\u0643\u0645", # نا، كم
- "\u0647\u0627",
- "\u0647\u0646",
- "\u0647\u0645", # ها، هن، هم
- )
- __conjugation_suffix_noun_3 = (
- "\u0643\u0645\u0627",
- "\u0647\u0645\u0627",
- ) # كما، هما
- # Prefixes added due to derivation Names
- __prefixes1 = ("\u0648\u0627", "\u0641\u0627") # فا، وا
- __articles_3len = ("\u0643\u0627\u0644", "\u0628\u0627\u0644") # بال كال
- __articles_2len = ("\u0627\u0644", "\u0644\u0644") # ال لل
- # Prepositions letters
- __prepositions1 = ("\u0643", "\u0644") # ك، ل
- __prepositions2 = ("\u0628\u0628", "\u0643\u0643") # بب، كك
- is_verb = True
- is_noun = True
- is_defined = False
- suffixes_verb_step1_success = False
- suffix_verb_step2a_success = False
- suffix_verb_step2b_success = False
- suffix_noun_step2c2_success = False
- suffix_noun_step1a_success = False
- suffix_noun_step2a_success = False
- suffix_noun_step2b_success = False
- suffixe_noun_step1b_success = False
- prefix_step2a_success = False
- prefix_step3a_noun_success = False
- prefix_step3b_noun_success = False
- def __normalize_pre(self, token):
- """
- :param token: string
- :return: normalized token type string
- """
- # strip diacritics
- token = self.__vocalization.sub("", token)
- # strip kasheeda
- token = self.__kasheeda.sub("", token)
- # strip punctuation marks
- token = self.__arabic_punctuation_marks.sub("", token)
- return token
- def __normalize_post(self, token):
- # normalize last hamza
- for hamza in self.__last_hamzat:
- if token.endswith(hamza):
- token = suffix_replace(token, hamza, "\u0621")
- break
- # normalize other hamzat
- token = self.__initial_hamzat.sub("\u0627", token)
- token = self.__waw_hamza.sub("\u0648", token)
- token = self.__yeh_hamza.sub("\u064a", token)
- token = self.__alefat.sub("\u0627", token)
- return token
- def __checks_1(self, token):
- for prefix in self.__checks1:
- if token.startswith(prefix):
- if prefix in self.__articles_3len and len(token) > 4:
- self.is_noun = True
- self.is_verb = False
- self.is_defined = True
- break
- if prefix in self.__articles_2len and len(token) > 3:
- self.is_noun = True
- self.is_verb = False
- self.is_defined = True
- break
- def __checks_2(self, token):
- for suffix in self.__checks2:
- if token.endswith(suffix):
- if suffix == "\u0629" and len(token) > 2:
- self.is_noun = True
- self.is_verb = False
- break
- if suffix == "\u0627\u062a" and len(token) > 3:
- self.is_noun = True
- self.is_verb = False
- break
- def __Suffix_Verb_Step1(self, token):
- for suffix in self.__suffix_verb_step1:
- if token.endswith(suffix):
- if suffix in self.__conjugation_suffix_verb_1 and len(token) >= 4:
- token = token[:-1]
- self.suffixes_verb_step1_success = True
- break
- if suffix in self.__conjugation_suffix_verb_2 and len(token) >= 5:
- token = token[:-2]
- self.suffixes_verb_step1_success = True
- break
- if suffix in self.__conjugation_suffix_verb_3 and len(token) >= 6:
- token = token[:-3]
- self.suffixes_verb_step1_success = True
- break
- return token
- def __Suffix_Verb_Step2a(self, token):
- for suffix in self.__suffix_verb_step2a:
- if token.endswith(suffix) and len(token) > 3:
- if suffix == "\u062a" and len(token) >= 4:
- token = token[:-1]
- self.suffix_verb_step2a_success = True
- break
- if suffix in self.__conjugation_suffix_verb_4 and len(token) >= 4:
- token = token[:-1]
- self.suffix_verb_step2a_success = True
- break
- if suffix in self.__conjugation_suffix_verb_past and len(token) >= 5:
- token = token[:-2] # past
- self.suffix_verb_step2a_success = True
- break
- if suffix in self.__conjugation_suffix_verb_present and len(token) > 5:
- token = token[:-2] # present
- self.suffix_verb_step2a_success = True
- break
- if suffix == "\u062a\u0645\u0627" and len(token) >= 6:
- token = token[:-3]
- self.suffix_verb_step2a_success = True
- break
- return token
- def __Suffix_Verb_Step2c(self, token):
- for suffix in self.__suffix_verb_step2c:
- if token.endswith(suffix):
- if suffix == "\u062a\u0645\u0648" and len(token) >= 6:
- token = token[:-3]
- break
- if suffix == "\u0648" and len(token) >= 4:
- token = token[:-1]
- break
- return token
- def __Suffix_Verb_Step2b(self, token):
- for suffix in self.__suffix_verb_step2b:
- if token.endswith(suffix) and len(token) >= 5:
- token = token[:-2]
- self.suffix_verb_step2b_success = True
- break
- return token
- def __Suffix_Noun_Step2c2(self, token):
- for suffix in self.__suffix_noun_step2c2:
- if token.endswith(suffix) and len(token) >= 3:
- token = token[:-1]
- self.suffix_noun_step2c2_success = True
- break
- return token
- def __Suffix_Noun_Step1a(self, token):
- for suffix in self.__suffix_noun_step1a:
- if token.endswith(suffix):
- if suffix in self.__conjugation_suffix_noun_1 and len(token) >= 4:
- token = token[:-1]
- self.suffix_noun_step1a_success = True
- break
- if suffix in self.__conjugation_suffix_noun_2 and len(token) >= 5:
- token = token[:-2]
- self.suffix_noun_step1a_success = True
- break
- if suffix in self.__conjugation_suffix_noun_3 and len(token) >= 6:
- token = token[:-3]
- self.suffix_noun_step1a_success = True
- break
- return token
- def __Suffix_Noun_Step2a(self, token):
- for suffix in self.__suffix_noun_step2a:
- if token.endswith(suffix) and len(token) > 4:
- token = token[:-1]
- self.suffix_noun_step2a_success = True
- break
- return token
- def __Suffix_Noun_Step2b(self, token):
- for suffix in self.__suffix_noun_step2b:
- if token.endswith(suffix) and len(token) >= 5:
- token = token[:-2]
- self.suffix_noun_step2b_success = True
- break
- return token
- def __Suffix_Noun_Step2c1(self, token):
- for suffix in self.__suffix_noun_step2c1:
- if token.endswith(suffix) and len(token) >= 4:
- token = token[:-1]
- break
- return token
- def __Suffix_Noun_Step1b(self, token):
- for suffix in self.__suffix_noun_step1b:
- if token.endswith(suffix) and len(token) > 5:
- token = token[:-1]
- self.suffixe_noun_step1b_success = True
- break
- return token
- def __Suffix_Noun_Step3(self, token):
- for suffix in self.__suffix_noun_step3:
- if token.endswith(suffix) and len(token) >= 3:
- token = token[:-1] # ya' nisbiya
- break
- return token
- def __Suffix_All_alef_maqsura(self, token):
- for suffix in self.__suffix_all_alef_maqsura:
- if token.endswith(suffix):
- token = suffix_replace(token, suffix, "\u064a")
- return token
- def __Prefix_Step1(self, token):
- for prefix in self.__prefix_step1:
- if token.startswith(prefix) and len(token) > 3:
- if prefix == "\u0623\u0623":
- token = prefix_replace(token, prefix, "\u0623")
- break
- elif prefix == "\u0623\u0622":
- token = prefix_replace(token, prefix, "\u0622")
- break
- elif prefix == "\u0623\u0624":
- token = prefix_replace(token, prefix, "\u0624")
- break
- elif prefix == "\u0623\u0627":
- token = prefix_replace(token, prefix, "\u0627")
- break
- elif prefix == "\u0623\u0625":
- token = prefix_replace(token, prefix, "\u0625")
- break
- return token
- def __Prefix_Step2a(self, token):
- for prefix in self.__prefix_step2a:
- if token.startswith(prefix) and len(token) > 5:
- token = token[len(prefix) :]
- self.prefix_step2a_success = True
- break
- return token
- def __Prefix_Step2b(self, token):
- for prefix in self.__prefix_step2b:
- if token.startswith(prefix) and len(token) > 3:
- if token[:2] not in self.__prefixes1:
- token = token[len(prefix) :]
- break
- return token
- def __Prefix_Step3a_Noun(self, token):
- for prefix in self.__prefix_step3a_noun:
- if token.startswith(prefix):
- if prefix in self.__articles_2len and len(token) > 4:
- token = token[len(prefix) :]
- self.prefix_step3a_noun_success = True
- break
- if prefix in self.__articles_3len and len(token) > 5:
- token = token[len(prefix) :]
- break
- return token
- def __Prefix_Step3b_Noun(self, token):
- for prefix in self.__prefix_step3b_noun:
- if token.startswith(prefix):
- if len(token) > 3:
- if prefix == "\u0628":
- token = token[len(prefix) :]
- self.prefix_step3b_noun_success = True
- break
- if prefix in self.__prepositions2:
- token = prefix_replace(token, prefix, prefix[1])
- self.prefix_step3b_noun_success = True
- break
- if prefix in self.__prepositions1 and len(token) > 4:
- token = token[len(prefix) :] # BUG: cause confusion
- self.prefix_step3b_noun_success = True
- break
- return token
- def __Prefix_Step3_Verb(self, token):
- for prefix in self.__prefix_step3_verb:
- if token.startswith(prefix) and len(token) > 4:
- token = prefix_replace(token, prefix, prefix[1])
- break
- return token
- def __Prefix_Step4_Verb(self, token):
- for prefix in self.__prefix_step4_verb:
- if token.startswith(prefix) and len(token) > 4:
- token = prefix_replace(token, prefix, "\u0627\u0633\u062a")
- self.is_verb = True
- self.is_noun = False
- break
- return token
- def stem(self, word):
- """
- Stem an Arabic word and return the stemmed form.
- :param word: string
- :return: string
- """
- # set initial values
- self.is_verb = True
- self.is_noun = True
- self.is_defined = False
- self.suffix_verb_step2a_success = False
- self.suffix_verb_step2b_success = False
- self.suffix_noun_step2c2_success = False
- self.suffix_noun_step1a_success = False
- self.suffix_noun_step2a_success = False
- self.suffix_noun_step2b_success = False
- self.suffixe_noun_step1b_success = False
- self.prefix_step2a_success = False
- self.prefix_step3a_noun_success = False
- self.prefix_step3b_noun_success = False
- modified_word = word
- # guess type and properties
- # checks1
- self.__checks_1(modified_word)
- # checks2
- self.__checks_2(modified_word)
- # Pre_Normalization
- modified_word = self.__normalize_pre(modified_word)
- # Avoid stopwords
- if modified_word in self.stopwords or len(modified_word) <= 2:
- return modified_word
- # Start stemming
- if self.is_verb:
- modified_word = self.__Suffix_Verb_Step1(modified_word)
- if self.suffixes_verb_step1_success:
- modified_word = self.__Suffix_Verb_Step2a(modified_word)
- if not self.suffix_verb_step2a_success:
- modified_word = self.__Suffix_Verb_Step2c(modified_word)
- # or next TODO: How to deal with or next instruction
- else:
- modified_word = self.__Suffix_Verb_Step2b(modified_word)
- if not self.suffix_verb_step2b_success:
- modified_word = self.__Suffix_Verb_Step2a(modified_word)
- if self.is_noun:
- modified_word = self.__Suffix_Noun_Step2c2(modified_word)
- if not self.suffix_noun_step2c2_success:
- if not self.is_defined:
- modified_word = self.__Suffix_Noun_Step1a(modified_word)
- # if self.suffix_noun_step1a_success:
- modified_word = self.__Suffix_Noun_Step2a(modified_word)
- if not self.suffix_noun_step2a_success:
- modified_word = self.__Suffix_Noun_Step2b(modified_word)
- if (
- not self.suffix_noun_step2b_success
- and not self.suffix_noun_step2a_success
- ):
- modified_word = self.__Suffix_Noun_Step2c1(modified_word)
- # or next ? todo : how to deal with or next
- else:
- modified_word = self.__Suffix_Noun_Step1b(modified_word)
- if self.suffixe_noun_step1b_success:
- modified_word = self.__Suffix_Noun_Step2a(modified_word)
- if not self.suffix_noun_step2a_success:
- modified_word = self.__Suffix_Noun_Step2b(modified_word)
- if (
- not self.suffix_noun_step2b_success
- and not self.suffix_noun_step2a_success
- ):
- modified_word = self.__Suffix_Noun_Step2c1(modified_word)
- else:
- if not self.is_defined:
- modified_word = self.__Suffix_Noun_Step2a(modified_word)
- modified_word = self.__Suffix_Noun_Step2b(modified_word)
- modified_word = self.__Suffix_Noun_Step3(modified_word)
- if not self.is_noun and self.is_verb:
- modified_word = self.__Suffix_All_alef_maqsura(modified_word)
- # prefixes
- modified_word = self.__Prefix_Step1(modified_word)
- modified_word = self.__Prefix_Step2a(modified_word)
- if not self.prefix_step2a_success:
- modified_word = self.__Prefix_Step2b(modified_word)
- modified_word = self.__Prefix_Step3a_Noun(modified_word)
- if not self.prefix_step3a_noun_success and self.is_noun:
- modified_word = self.__Prefix_Step3b_Noun(modified_word)
- else:
- if not self.prefix_step3b_noun_success and self.is_verb:
- modified_word = self.__Prefix_Step3_Verb(modified_word)
- modified_word = self.__Prefix_Step4_Verb(modified_word)
- # post normalization stemming
- modified_word = self.__normalize_post(modified_word)
- stemmed_word = modified_word
- return stemmed_word
class DanishStemmer(_ScandinavianStemmer):
    """
    Snowball stemmer for Danish.

    :cvar __vowels: The Danish vowels.
    :type __vowels: unicode
    :cvar __consonants: The Danish consonants.
    :type __consonants: unicode
    :cvar __double_consonants: The Danish double consonants.
    :type __double_consonants: tuple
    :cvar __s_ending: Letters that may directly appear before a word final 's'.
    :type __s_ending: unicode
    :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm.
    :type __step1_suffixes: tuple
    :cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm.
    :type __step2_suffixes: tuple
    :cvar __step3_suffixes: Suffixes to be deleted in step 3 of the algorithm.
    :type __step3_suffixes: tuple
    :note: A detailed description of the Danish
           stemming algorithm can be found under
           http://snowball.tartarus.org/algorithms/danish/stemmer.html
    """

    # Character classes of the language.
    __vowels = "aeiouy\xE6\xE5\xF8"
    __consonants = "bcdfghjklmnpqrstvwxz"
    # "bb", "cc", ..., "zz": every consonant doubled, in alphabet order.
    __double_consonants = tuple(letter * 2 for letter in __consonants)
    __s_ending = "abcdfghjklmnoprtvyz\xE5"

    # Suffix tables, one per algorithm step, longest entries first so
    # that the first hit is also the longest match.
    __step1_suffixes = (
        "erendes",
        "erende",
        "hedens",
        "ethed",
        "erede",
        "heden",
        "heder",
        "endes",
        "ernes",
        "erens",
        "erets",
        "ered",
        "ende",
        "erne",
        "eren",
        "erer",
        "heds",
        "enes",
        "eres",
        "eret",
        "hed",
        "ene",
        "ere",
        "ens",
        "ers",
        "ets",
        "en",
        "er",
        "es",
        "et",
        "e",
        "s",
    )
    __step2_suffixes = ("gd", "dt", "gt", "kt")
    __step3_suffixes = ("elig", "l\xF8st", "lig", "els", "ig")

    def stem(self, word):
        """
        Return the stem of a Danish word.

        The word is lower-cased first; stopwords are returned unchanged
        (in lower case).

        :param word: The word that is stemmed.
        :type word: str or unicode
        :return: The stemmed form.
        :rtype: unicode
        """
        word = word.lower()
        if word in self.stopwords:
            return word

        # Region R1 is supplied by the Scandinavian base class.
        r1 = self._r1_scandinavian(word, self.__vowels)

        # STEP 1: remove the first listed suffix found in R1. A bare "s"
        # only goes if the letter in front of it is a valid s-ending.
        ending = next((s for s in self.__step1_suffixes if r1.endswith(s)), None)
        if ending == "s":
            if word[-2] in self.__s_ending:
                word, r1 = word[:-1], r1[:-1]
        elif ending is not None:
            cut = len(ending)
            word, r1 = word[:-cut], r1[:-cut]

        # STEP 2: for gd/dt/gt/kt in R1, drop the last letter.
        if r1.endswith(self.__step2_suffixes):
            word, r1 = word[:-1], r1[:-1]

        # STEP 3: "igst" loses its "st"; then the first listed suffix
        # found in R1 is handled.
        if r1.endswith("igst"):
            word, r1 = word[:-2], r1[:-2]

        ending = next((s for s in self.__step3_suffixes if r1.endswith(s)), None)
        if ending == "l\xF8st":
            # only the final "t" is removed
            word, r1 = word[:-1], r1[:-1]
        elif ending is not None:
            cut = len(ending)
            word, r1 = word[:-cut], r1[:-cut]
            # after a deletion, step 2 is applied once more
            if r1.endswith(self.__step2_suffixes):
                word, r1 = word[:-1], r1[:-1]

        # STEP 4: Undouble
        if len(word) > 3 and word.endswith(self.__double_consonants):
            word = word[:-1]

        return word
class DutchStemmer(_StandardStemmer):
    """
    Snowball stemmer for Dutch.

    :cvar __vowels: The Dutch vowels.
    :type __vowels: unicode
    :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm.
    :type __step1_suffixes: tuple
    :cvar __step3b_suffixes: Suffixes to be deleted in step 3b of the algorithm.
    :type __step3b_suffixes: tuple
    :note: A detailed description of the Dutch
           stemming algorithm can be found under
           http://snowball.tartarus.org/algorithms/dutch/stemmer.html
    """

    __vowels = "aeiouy\xE8"
    __step1_suffixes = ("heden", "ene", "en", "se", "s")
    __step3b_suffixes = ("baar", "lijk", "bar", "end", "ing", "ig")

    def stem(self, word):
        """
        Return the stem of a Dutch word.

        The word is lower-cased first; stopwords are returned unchanged
        (in lower case).

        :param word: The word that is stemmed.
        :type word: str or unicode
        :return: The stemmed form.
        :rtype: unicode
        """
        word = word.lower()
        if word in self.stopwords:
            return word

        vowels = self.__vowels
        doubled = ("kk", "dd", "tt")
        e_removed = False  # becomes True when step 2 deletes a final "e"

        # Strip umlauts and acute accents from the vowels in one pass.
        word = word.translate(
            str.maketrans(
                "\xE4\xE1\xEB\xE9\xED\xEF\xF6\xF3\xFC\xFA",
                "aaeeiioouu",
            )
        )

        # An initial 'y', a 'y' after a vowel and an 'i' between vowels
        # are upper-cased; from here on they count as consonants.
        letters = list(word)
        if letters and letters[0] == "y":
            letters[0] = "Y"
        for pos in range(1, len(letters)):
            if letters[pos] == "y" and letters[pos - 1] in vowels:
                letters[pos] = "Y"
        for pos in range(1, len(letters) - 1):
            if (
                letters[pos] == "i"
                and letters[pos - 1] in vowels
                and letters[pos + 1] in vowels
            ):
                letters[pos] = "I"
        word = "".join(letters)

        r1, r2 = self._r1r2_standard(word, vowels)

        # R1 must be preceded by at least three letters. Only a word that
        # opens with vowel + consonant yields a shorter prefix (two
        # letters); in that case R1 is moved to start after letter three.
        for pos in range(1, len(word)):
            if word[pos] not in vowels and word[pos - 1] in vowels:
                if pos == 1:
                    r1 = word[3:]
                break

        # STEP 1: act on the first listed suffix found in R1.
        ending = next((s for s in self.__step1_suffixes if r1.endswith(s)), None)
        if ending == "heden":
            word = suffix_replace(word, ending, "heid")
            r1 = suffix_replace(r1, ending, "heid")
            if r2.endswith("heden"):
                r2 = suffix_replace(r2, ending, "heid")
        elif ending in ("ene", "en"):
            cut = len(ending)
            if (
                not word.endswith("heden")
                and word[-cut - 1] not in vowels
                and word[-cut - 3 : -cut] != "gem"
            ):
                word, r1, r2 = word[:-cut], r1[:-cut], r2[:-cut]
                if word.endswith(doubled):
                    word, r1, r2 = word[:-1], r1[:-1], r2[:-1]
        elif ending in ("se", "s"):
            cut = len(ending)
            if word[-cut - 1] not in vowels and word[-cut - 1] != "j":
                word, r1, r2 = word[:-cut], r1[:-cut], r2[:-cut]

        # STEP 2: delete a final "e" in R1 that follows a non-vowel.
        if r1.endswith("e") and word[-2] not in vowels:
            e_removed = True
            word, r1, r2 = word[:-1], r1[:-1], r2[:-1]
            if word.endswith(doubled):
                word, r1, r2 = word[:-1], r1[:-1], r2[:-1]

        # STEP 3a: delete "heid" in R2 unless preceded by "c"; a
        # preceding "en" is then treated like the "en" of step 1.
        if r2.endswith("heid") and word[-5] != "c":
            word, r1, r2 = word[:-4], r1[:-4], r2[:-4]
            if (
                r1.endswith("en")
                and word[-3] not in vowels
                and word[-5:-2] != "gem"
            ):
                word, r1, r2 = word[:-2], r1[:-2], r2[:-2]
                if word.endswith(doubled):
                    word, r1, r2 = word[:-1], r1[:-1], r2[:-1]

        # STEP 3b: Derivational suffixes -- act on the first listed
        # suffix found in R2.
        ending = next((s for s in self.__step3b_suffixes if r2.endswith(s)), None)
        if ending in ("end", "ing"):
            word, r2 = word[:-3], r2[:-3]
            if r2.endswith("ig") and word[-3] != "e":
                word = word[:-2]
            elif word.endswith(doubled):
                word = word[:-1]
        elif ending == "ig":
            if word[-3] != "e":
                word = word[:-2]
        elif ending == "lijk":
            word, r1 = word[:-4], r1[:-4]
            # repeat the step-2 "e" deletion on what is left
            if r1.endswith("e") and word[-2] not in vowels:
                word = word[:-1]
                if word.endswith(doubled):
                    word = word[:-1]
        elif ending == "baar":
            word = word[:-4]
        elif ending == "bar":
            if e_removed:
                word = word[:-3]

        # STEP 4: Undouble vowel -- consonant, doubled aa/ee/oo/uu,
        # final consonant other than "I": keep one copy of the vowel.
        if (
            len(word) >= 4
            and word[-1] not in vowels
            and word[-1] != "I"
            and word[-3:-1] in ("aa", "ee", "oo", "uu")
            and word[-4] not in vowels
        ):
            word = word[:-2] + word[-1]

        # All occurrences of 'I' and 'Y' are put back into lower case.
        return word.replace("I", "i").replace("Y", "y")
- class EnglishStemmer(_StandardStemmer):
- """
- The English Snowball stemmer.
- :cvar __vowels: The English vowels.
- :type __vowels: unicode
- :cvar __double_consonants: The English double consonants.
- :type __double_consonants: tuple
- :cvar __li_ending: Letters that may directly appear before a word final 'li'.
- :type __li_ending: unicode
- :cvar __step0_suffixes: Suffixes to be deleted in step 0 of the algorithm.
- :type __step0_suffixes: tuple
- :cvar __step1a_suffixes: Suffixes to be deleted in step 1a of the algorithm.
- :type __step1a_suffixes: tuple
- :cvar __step1b_suffixes: Suffixes to be deleted in step 1b of the algorithm.
- :type __step1b_suffixes: tuple
- :cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm.
- :type __step2_suffixes: tuple
- :cvar __step3_suffixes: Suffixes to be deleted in step 3 of the algorithm.
- :type __step3_suffixes: tuple
- :cvar __step4_suffixes: Suffixes to be deleted in step 4 of the algorithm.
- :type __step4_suffixes: tuple
- :cvar __step5_suffixes: Suffixes to be deleted in step 5 of the algorithm.
- :type __step5_suffixes: tuple
- :cvar __special_words: A dictionary containing words
- which have to be stemmed specially.
- :type __special_words: dict
- :note: A detailed description of the English
- stemming algorithm can be found under
- http://snowball.tartarus.org/algorithms/english/stemmer.html
- """
- __vowels = "aeiouy"
- __double_consonants = ("bb", "dd", "ff", "gg", "mm", "nn", "pp", "rr", "tt")
- __li_ending = "cdeghkmnrt"
- __step0_suffixes = ("'s'", "'s", "'")
- __step1a_suffixes = ("sses", "ied", "ies", "us", "ss", "s")
- __step1b_suffixes = ("eedly", "ingly", "edly", "eed", "ing", "ed")
- __step2_suffixes = (
- "ization",
- "ational",
- "fulness",
- "ousness",
- "iveness",
- "tional",
- "biliti",
- "lessli",
- "entli",
- "ation",
- "alism",
- "aliti",
- "ousli",
- "iviti",
- "fulli",
- "enci",
- "anci",
- "abli",
- "izer",
- "ator",
- "alli",
- "bli",
- "ogi",
- "li",
- )
- __step3_suffixes = (
- "ational",
- "tional",
- "alize",
- "icate",
- "iciti",
- "ative",
- "ical",
- "ness",
- "ful",
- )
- __step4_suffixes = (
- "ement",
- "ance",
- "ence",
- "able",
- "ible",
- "ment",
- "ant",
- "ent",
- "ism",
- "ate",
- "iti",
- "ous",
- "ive",
- "ize",
- "ion",
- "al",
- "er",
- "ic",
- )
- __step5_suffixes = ("e", "l")
- __special_words = {
- "skis": "ski",
- "skies": "sky",
- "dying": "die",
- "lying": "lie",
- "tying": "tie",
- "idly": "idl",
- "gently": "gentl",
- "ugly": "ugli",
- "early": "earli",
- "only": "onli",
- "singly": "singl",
- "sky": "sky",
- "news": "news",
- "howe": "howe",
- "atlas": "atlas",
- "cosmos": "cosmos",
- "bias": "bias",
- "andes": "andes",
- "inning": "inning",
- "innings": "inning",
- "outing": "outing",
- "outings": "outing",
- "canning": "canning",
- "cannings": "canning",
- "herring": "herring",
- "herrings": "herring",
- "earring": "earring",
- "earrings": "earring",
- "proceed": "proceed",
- "proceeds": "proceed",
- "proceeded": "proceed",
- "proceeding": "proceed",
- "exceed": "exceed",
- "exceeds": "exceed",
- "exceeded": "exceed",
- "exceeding": "exceed",
- "succeed": "succeed",
- "succeeds": "succeed",
- "succeeded": "succeed",
- "succeeding": "succeed",
- }
- def stem(self, word):
- """
- Stem an English word and return the stemmed form.
- :param word: The word that is stemmed.
- :type word: str or unicode
- :return: The stemmed form.
- :rtype: unicode
- """
- word = word.lower()
- if word in self.stopwords or len(word) <= 2:
- return word
- elif word in self.__special_words:
- return self.__special_words[word]
- # Map the different apostrophe characters to a single consistent one
- word = (
- word.replace("\u2019", "\x27")
- .replace("\u2018", "\x27")
- .replace("\u201B", "\x27")
- )
- if word.startswith("\x27"):
- word = word[1:]
- if word.startswith("y"):
- word = "".join(("Y", word[1:]))
- for i in range(1, len(word)):
- if word[i - 1] in self.__vowels and word[i] == "y":
- word = "".join((word[:i], "Y", word[i + 1 :]))
- step1a_vowel_found = False
- step1b_vowel_found = False
- r1 = ""
- r2 = ""
- if word.startswith(("gener", "commun", "arsen")):
- if word.startswith(("gener", "arsen")):
- r1 = word[5:]
- else:
- r1 = word[6:]
- for i in range(1, len(r1)):
- if r1[i] not in self.__vowels and r1[i - 1] in self.__vowels:
- r2 = r1[i + 1 :]
- break
- else:
- r1, r2 = self._r1r2_standard(word, self.__vowels)
- # STEP 0
- for suffix in self.__step0_suffixes:
- if word.endswith(suffix):
- word = word[: -len(suffix)]
- r1 = r1[: -len(suffix)]
- r2 = r2[: -len(suffix)]
- break
- # STEP 1a
- for suffix in self.__step1a_suffixes:
- if word.endswith(suffix):
- if suffix == "sses":
- word = word[:-2]
- r1 = r1[:-2]
- r2 = r2[:-2]
- elif suffix in ("ied", "ies"):
- if len(word[: -len(suffix)]) > 1:
- word = word[:-2]
- r1 = r1[:-2]
- r2 = r2[:-2]
- else:
- word = word[:-1]
- r1 = r1[:-1]
- r2 = r2[:-1]
- elif suffix == "s":
- for letter in word[:-2]:
- if letter in self.__vowels:
- step1a_vowel_found = True
- break
- if step1a_vowel_found:
- word = word[:-1]
- r1 = r1[:-1]
- r2 = r2[:-1]
- break
- # STEP 1b
- for suffix in self.__step1b_suffixes:
- if word.endswith(suffix):
- if suffix in ("eed", "eedly"):
- if r1.endswith(suffix):
- word = suffix_replace(word, suffix, "ee")
- if len(r1) >= len(suffix):
- r1 = suffix_replace(r1, suffix, "ee")
- else:
- r1 = ""
- if len(r2) >= len(suffix):
- r2 = suffix_replace(r2, suffix, "ee")
- else:
- r2 = ""
- else:
- for letter in word[: -len(suffix)]:
- if letter in self.__vowels:
- step1b_vowel_found = True
- break
- if step1b_vowel_found:
- word = word[: -len(suffix)]
- r1 = r1[: -len(suffix)]
- r2 = r2[: -len(suffix)]
- if word.endswith(("at", "bl", "iz")):
- word = "".join((word, "e"))
- r1 = "".join((r1, "e"))
- if len(word) > 5 or len(r1) >= 3:
- r2 = "".join((r2, "e"))
- elif word.endswith(self.__double_consonants):
- word = word[:-1]
- r1 = r1[:-1]
- r2 = r2[:-1]
- elif (
- r1 == ""
- and len(word) >= 3
- and word[-1] not in self.__vowels
- and word[-1] not in "wxY"
- and word[-2] in self.__vowels
- and word[-3] not in self.__vowels
- ) or (
- r1 == ""
- and len(word) == 2
- and word[0] in self.__vowels
- and word[1] not in self.__vowels
- ):
- word = "".join((word, "e"))
- if len(r1) > 0:
- r1 = "".join((r1, "e"))
- if len(r2) > 0:
- r2 = "".join((r2, "e"))
- break
- # STEP 1c
- if len(word) > 2 and word[-1] in "yY" and word[-2] not in self.__vowels:
- word = "".join((word[:-1], "i"))
- if len(r1) >= 1:
- r1 = "".join((r1[:-1], "i"))
- else:
- r1 = ""
- if len(r2) >= 1:
- r2 = "".join((r2[:-1], "i"))
- else:
- r2 = ""
- # STEP 2
- for suffix in self.__step2_suffixes:
- if word.endswith(suffix):
- if r1.endswith(suffix):
- if suffix == "tional":
- word = word[:-2]
- r1 = r1[:-2]
- r2 = r2[:-2]
- elif suffix in ("enci", "anci", "abli"):
- word = "".join((word[:-1], "e"))
- if len(r1) >= 1:
- r1 = "".join((r1[:-1], "e"))
- else:
- r1 = ""
- if len(r2) >= 1:
- r2 = "".join((r2[:-1], "e"))
- else:
- r2 = ""
- elif suffix == "entli":
- word = word[:-2]
- r1 = r1[:-2]
- r2 = r2[:-2]
- elif suffix in ("izer", "ization"):
- word = suffix_replace(word, suffix, "ize")
- if len(r1) >= len(suffix):
- r1 = suffix_replace(r1, suffix, "ize")
- else:
- r1 = ""
- if len(r2) >= len(suffix):
- r2 = suffix_replace(r2, suffix, "ize")
- else:
- r2 = ""
- elif suffix in ("ational", "ation", "ator"):
- word = suffix_replace(word, suffix, "ate")
- if len(r1) >= len(suffix):
- r1 = suffix_replace(r1, suffix, "ate")
- else:
- r1 = ""
- if len(r2) >= len(suffix):
- r2 = suffix_replace(r2, suffix, "ate")
- else:
- r2 = "e"
- elif suffix in ("alism", "aliti", "alli"):
- word = suffix_replace(word, suffix, "al")
- if len(r1) >= len(suffix):
- r1 = suffix_replace(r1, suffix, "al")
- else:
- r1 = ""
- if len(r2) >= len(suffix):
- r2 = suffix_replace(r2, suffix, "al")
- else:
- r2 = ""
- elif suffix == "fulness":
- word = word[:-4]
- r1 = r1[:-4]
- r2 = r2[:-4]
- elif suffix in ("ousli", "ousness"):
- word = suffix_replace(word, suffix, "ous")
- if len(r1) >= len(suffix):
- r1 = suffix_replace(r1, suffix, "ous")
- else:
- r1 = ""
- if len(r2) >= len(suffix):
- r2 = suffix_replace(r2, suffix, "ous")
- else:
- r2 = ""
- elif suffix in ("iveness", "iviti"):
- word = suffix_replace(word, suffix, "ive")
- if len(r1) >= len(suffix):
- r1 = suffix_replace(r1, suffix, "ive")
- else:
- r1 = ""
- if len(r2) >= len(suffix):
- r2 = suffix_replace(r2, suffix, "ive")
- else:
- r2 = "e"
- elif suffix in ("biliti", "bli"):
- word = suffix_replace(word, suffix, "ble")
- if len(r1) >= len(suffix):
- r1 = suffix_replace(r1, suffix, "ble")
- else:
- r1 = ""
- if len(r2) >= len(suffix):
- r2 = suffix_replace(r2, suffix, "ble")
- else:
- r2 = ""
- elif suffix == "ogi" and word[-4] == "l":
- word = word[:-1]
- r1 = r1[:-1]
- r2 = r2[:-1]
- elif suffix in ("fulli", "lessli"):
- word = word[:-2]
- r1 = r1[:-2]
- r2 = r2[:-2]
- elif suffix == "li" and word[-3] in self.__li_ending:
- word = word[:-2]
- r1 = r1[:-2]
- r2 = r2[:-2]
- break
- # STEP 3
- for suffix in self.__step3_suffixes:
- if word.endswith(suffix):
- if r1.endswith(suffix):
- if suffix == "tional":
- word = word[:-2]
- r1 = r1[:-2]
- r2 = r2[:-2]
- elif suffix == "ational":
- word = suffix_replace(word, suffix, "ate")
- if len(r1) >= len(suffix):
- r1 = suffix_replace(r1, suffix, "ate")
- else:
- r1 = ""
- if len(r2) >= len(suffix):
- r2 = suffix_replace(r2, suffix, "ate")
- else:
- r2 = ""
- elif suffix == "alize":
- word = word[:-3]
- r1 = r1[:-3]
- r2 = r2[:-3]
- elif suffix in ("icate", "iciti", "ical"):
- word = suffix_replace(word, suffix, "ic")
- if len(r1) >= len(suffix):
- r1 = suffix_replace(r1, suffix, "ic")
- else:
- r1 = ""
- if len(r2) >= len(suffix):
- r2 = suffix_replace(r2, suffix, "ic")
- else:
- r2 = ""
- elif suffix in ("ful", "ness"):
- word = word[: -len(suffix)]
- r1 = r1[: -len(suffix)]
- r2 = r2[: -len(suffix)]
- elif suffix == "ative" and r2.endswith(suffix):
- word = word[:-5]
- r1 = r1[:-5]
- r2 = r2[:-5]
- break
- # STEP 4
- for suffix in self.__step4_suffixes:
- if word.endswith(suffix):
- if r2.endswith(suffix):
- if suffix == "ion":
- if word[-4] in "st":
- word = word[:-3]
- r1 = r1[:-3]
- r2 = r2[:-3]
- else:
- word = word[: -len(suffix)]
- r1 = r1[: -len(suffix)]
- r2 = r2[: -len(suffix)]
- break
- # STEP 5
- if r2.endswith("l") and word[-2] == "l":
- word = word[:-1]
- elif r2.endswith("e"):
- word = word[:-1]
- elif r1.endswith("e"):
- if len(word) >= 4 and (
- word[-2] in self.__vowels
- or word[-2] in "wxY"
- or word[-3] not in self.__vowels
- or word[-4] in self.__vowels
- ):
- word = word[:-1]
- word = word.replace("Y", "y")
- return word
class FinnishStemmer(_StandardStemmer):
    """
    The Finnish Snowball stemmer.

    The class attributes below hold the character classes and the suffix
    tables consulted by :meth:`stem`.  Each suffix table is scanned in the
    order in which it is written.

    :cvar __vowels: The Finnish vowels.
    :type __vowels: unicode
    :cvar __restricted_vowels: A subset of the Finnish vowels.
    :type __restricted_vowels: unicode
    :cvar __long_vowels: The Finnish vowels in their long forms.
    :type __long_vowels: tuple
    :cvar __consonants: The Finnish consonants.
    :type __consonants: unicode
    :cvar __double_consonants: The Finnish double consonants.
    :type __double_consonants: tuple
    :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm.
    :type __step1_suffixes: tuple
    :cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm.
    :type __step2_suffixes: tuple
    :cvar __step3_suffixes: Suffixes to be deleted in step 3 of the algorithm.
    :type __step3_suffixes: tuple
    :cvar __step4_suffixes: Suffixes to be deleted in step 4 of the algorithm.
    :type __step4_suffixes: tuple
    :note: A detailed description of the Finnish
           stemming algorithm can be found under
           http://snowball.tartarus.org/algorithms/finnish/stemmer.html
    """

    __vowels = "aeiouy\xE4\xF6"
    __restricted_vowels = "aeiou\xE4\xF6"
    __long_vowels = ("aa", "ee", "ii", "oo", "uu", "\xE4\xE4", "\xF6\xF6")
    __consonants = "bcdfghjklmnpqrstvwxz"
    __double_consonants = (
        "bb", "cc", "dd", "ff", "gg", "hh", "jj", "kk", "ll", "mm",
        "nn", "pp", "qq", "rr", "ss", "tt", "vv", "ww", "xx", "zz",
    )
    __step1_suffixes = (
        "kaan", "k\xE4\xE4n", "sti", "kin", "han",
        "h\xE4n", "ko", "k\xF6", "pa", "p\xE4",
    )
    __step2_suffixes = ("nsa", "ns\xE4", "mme", "nne", "si", "ni", "an", "\xE4n", "en")
    __step3_suffixes = (
        "siin", "tten", "seen",
        "han", "hen", "hin", "hon", "h\xE4n", "h\xF6n",
        "den", "tta", "tt\xE4", "ssa", "ss\xE4", "sta", "st\xE4",
        "lla", "ll\xE4", "lta", "lt\xE4", "lle", "ksi", "ine",
        "ta", "t\xE4", "na", "n\xE4",
        "a", "\xE4", "n",
    )
    __step4_suffixes = (
        "impi", "impa", "imp\xE4", "immi", "imma", "imm\xE4",
        "mpi", "mpa", "mp\xE4", "mmi", "mma", "mm\xE4",
        "eja", "ej\xE4",
    )

    def stem(self, word):
        """
        Stem a Finnish word and return the stemmed form.

        The word is lower-cased first and returned untouched when it is
        found in ``self.stopwords``.  Otherwise six steps run in sequence,
        trimming ``word`` together with its regions ``r1`` and ``r2``.

        :param word: The word that is stemmed.
        :type word: str or unicode
        :return: The stemmed form.
        :rtype: unicode
        """
        word = word.lower()
        if word in self.stopwords:
            return word

        vowels = self.__vowels
        long_vowels = self.__long_vowels

        # Remembers whether step 3 deleted anything; step 5 branches on it.
        case_removed = False

        r1, r2 = self._r1r2_standard(word, vowels)

        # STEP 1: Particles etc.
        for ending in self.__step1_suffixes:
            if not r1.endswith(ending):
                continue
            size = len(ending)
            if ending == "sti":
                deletable = ending in r2
            else:
                # Other particles go only after n, t or a vowel.
                deletable = word[-size - 1] in "ntaeiouy\xE4\xF6"
            if deletable:
                word, r1, r2 = word[:-size], r1[:-size], r2[:-size]
            break

        # STEP 2: Possessives
        for ending in self.__step2_suffixes:
            if not r1.endswith(ending):
                continue
            if ending == "si":
                if word[-3] != "k":
                    word, r1, r2 = word[:-2], r1[:-2], r2[:-2]
            elif ending == "ni":
                word, r1, r2 = word[:-2], r1[:-2], r2[:-2]
                # A remaining final "kse" is turned into "ksi" in the word
                # and in whichever regions also end with it.
                if word.endswith("kse"):
                    word = suffix_replace(word, "kse", "ksi")
                if r1.endswith("kse"):
                    r1 = suffix_replace(r1, "kse", "ksi")
                if r2.endswith("kse"):
                    r2 = suffix_replace(r2, "kse", "ksi")
            elif ending == "an":
                if word[-4:-2] in ("ta", "na") or word[-5:-2] in (
                    "ssa",
                    "sta",
                    "lla",
                    "lta",
                ):
                    word, r1, r2 = word[:-2], r1[:-2], r2[:-2]
            elif ending == "\xE4n":
                if word[-4:-2] in ("t\xE4", "n\xE4") or word[-5:-2] in (
                    "ss\xE4",
                    "st\xE4",
                    "ll\xE4",
                    "lt\xE4",
                ):
                    word, r1, r2 = word[:-2], r1[:-2], r2[:-2]
            elif ending == "en":
                if word[-5:-2] in ("lle", "ine"):
                    word, r1, r2 = word[:-2], r1[:-2], r2[:-2]
            else:
                # The remaining table entries are all three letters long.
                word, r1, r2 = word[:-3], r1[:-3], r2[:-3]
            break

        # STEP 3: Cases
        for ending in self.__step3_suffixes:
            if not r1.endswith(ending):
                continue
            size = len(ending)
            if ending in ("han", "hen", "hin", "hon", "h\xE4n", "h\xF6n"):
                # Deleted only when the letter in front of the ending is
                # the same vowel as the one inside the ending.
                if word[-4] == ending[1]:
                    word, r1, r2 = word[:-3], r1[:-3], r2[:-3]
                    case_removed = True
            elif ending in ("siin", "den", "tten"):
                preceded_ok = (
                    word[-size - 1] == "i"
                    and word[-size - 2] in self.__restricted_vowels
                )
                if not preceded_ok:
                    # Try the following (shorter) table entries instead.
                    continue
                word, r1, r2 = word[:-size], r1[:-size], r2[:-size]
                case_removed = True
            elif ending == "seen":
                if word[-6:-4] not in long_vowels:
                    continue
                word, r1, r2 = word[:-4], r1[:-4], r2[:-4]
                case_removed = True
            elif ending in ("a", "\xE4"):
                if word[-2] in vowels and word[-3] in self.__consonants:
                    word, r1, r2 = word[:-1], r1[:-1], r2[:-1]
                    case_removed = True
            elif ending in ("tta", "tt\xE4"):
                if word[-4] == "e":
                    word, r1, r2 = word[:-3], r1[:-3], r2[:-3]
                    case_removed = True
            elif ending == "n":
                word, r1, r2 = word[:-1], r1[:-1], r2[:-1]
                case_removed = True
                # After dropping "n", a final "ie" or long vowel loses its
                # last letter as well.
                if word[-2:] == "ie" or word[-2:] in long_vowels:
                    word, r1, r2 = word[:-1], r1[:-1], r2[:-1]
            else:
                word, r1, r2 = word[:-size], r1[:-size], r2[:-size]
                case_removed = True
            break

        # STEP 4: Other endings
        for ending in self.__step4_suffixes:
            if not r2.endswith(ending):
                continue
            if ending in ("mpi", "mpa", "mp\xE4", "mmi", "mma", "mm\xE4"):
                if word[-5:-3] != "po":
                    word, r1, r2 = word[:-3], r1[:-3], r2[:-3]
            else:
                size = len(ending)
                word, r1, r2 = word[:-size], r1[:-size], r2[:-size]
            break

        # STEP 5: Plurals
        if case_removed and len(r1) >= 1 and r1[-1] in "ij":
            word, r1 = word[:-1], r1[:-1]
        elif (
            not case_removed
            and len(r1) >= 2
            and r1[-1] == "t"
            and r1[-2] in vowels
        ):
            word, r1, r2 = word[:-1], r1[:-1], r2[:-1]
            if r2.endswith("imma"):
                word, r1 = word[:-4], r1[:-4]
            elif r2.endswith("mma") and r2[-5:-3] != "po":
                word, r1 = word[:-3], r1[:-3]

        # STEP 6: Tidying up
        if r1[-2:] in long_vowels:
            word, r1 = word[:-1], r1[:-1]

        if len(r1) >= 2 and r1[-2] in self.__consonants and r1[-1] in "a\xE4ei":
            word, r1 = word[:-1], r1[:-1]

        if r1.endswith(("oj", "uj")):
            word, r1 = word[:-1], r1[:-1]

        if r1.endswith("jo"):
            word, r1 = word[:-1], r1[:-1]

        # If the word ends with a double consonant
        # followed by zero or more vowels, the last consonant is removed.
        for offset in range(1, len(word)):
            if word[-offset] in vowels:
                continue
            # word[-offset] is the last non-vowel; check whether it and the
            # letter before it form a double consonant.
            if offset == 1:
                if word[-2:] in self.__double_consonants:
                    word = word[:-1]
            elif word[-offset - 1 : -offset + 1] in self.__double_consonants:
                word = word[:-offset] + word[-offset + 1 :]
            break

        return word
class FrenchStemmer(_StandardStemmer):
    """
    The French Snowball stemmer.

    :cvar __vowels: The French vowels.
    :type __vowels: unicode
    :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm.
    :type __step1_suffixes: tuple
    :cvar __step2a_suffixes: Suffixes to be deleted in step 2a of the algorithm.
    :type __step2a_suffixes: tuple
    :cvar __step2b_suffixes: Suffixes to be deleted in step 2b of the algorithm.
    :type __step2b_suffixes: tuple
    :cvar __step4_suffixes: Suffixes to be deleted in step 4 of the algorithm.
    :type __step4_suffixes: tuple
    :note: A detailed description of the French
           stemming algorithm can be found under
           http://snowball.tartarus.org/algorithms/french/stemmer.html
    """

    __vowels = "aeiouy\xE2\xE0\xEB\xE9\xEA\xE8\xEF\xEE\xF4\xFB\xF9"
    # The upper-case letters inside the suffix tables are the markers that
    # stem() writes into the word before any suffix is examined.
    __step1_suffixes = (
        "issements", "issement", "atrices", "atrice", "ateurs", "ations",
        "logies", "usions", "utions", "ements", "amment", "emment",
        "ances", "iqUes", "ismes", "ables", "istes", "ateur", "ation",
        "logie", "usion", "ution", "ences", "ement", "euses", "ments",
        "ance", "iqUe", "isme", "able", "iste", "ence", "it\xE9s", "ives",
        "eaux", "euse", "ment", "eux", "it\xE9", "ive", "ifs", "aux", "if",
    )
    __step2a_suffixes = (
        "issaIent", "issantes", "iraIent", "issante", "issants", "issions",
        "irions", "issais", "issait", "issant", "issent", "issiez", "issons",
        "irais", "irait", "irent", "iriez", "irons", "iront", "isses",
        "issez", "\xEEmes", "\xEEtes", "irai", "iras", "irez", "isse",
        "ies", "ira", "\xEEt", "ie", "ir", "is", "it", "i",
    )
    __step2b_suffixes = (
        "eraIent", "assions", "erions", "assent", "assiez", "\xE8rent",
        "erais", "erait", "eriez", "erons", "eront", "aIent", "antes",
        "asses", "ions", "erai", "eras", "erez", "\xE2mes", "\xE2tes",
        "ante", "ants", "asse", "\xE9es", "era", "iez", "ais", "ait",
        "ant", "\xE9e", "\xE9s", "er", "ez", "\xE2t", "ai", "as", "\xE9", "a",
    )
    __step4_suffixes = ("i\xE8re", "I\xE8re", "ion", "ier", "Ier", "e", "\xEB")

    def stem(self, word):
        """
        Stem a French word and return the stemmed form.

        The word is lower-cased and returned as is when it appears in
        ``self.stopwords``.  Otherwise some 'u', 'i' and 'y' letters are
        temporarily upper-cased as markers, the suffix-removal steps run,
        and the markers are lower-cased again before returning.

        :param word: The word that is stemmed.
        :type word: str or unicode
        :return: The stemmed form.
        :rtype: unicode
        """
        word = word.lower()
        if word in self.stopwords:
            return word

        vowels = self.__vowels
        step1_success = False
        rv_ending_found = False
        step2a_success = False
        step2b_success = False

        # Every occurrence of 'u' after 'q' is put into upper case.
        for i in range(1, len(word)):
            if word[i - 1] == "q" and word[i] == "u":
                word = word[:i] + "U" + word[i + 1 :]

        # Every occurrence of 'u' and 'i'
        # between vowels is put into upper case.
        # Every occurrence of 'y' preceded or
        # followed by a vowel is also put into upper case.
        # (Only interior positions are visited by this loop.)
        for i in range(1, len(word) - 1):
            prev_is_vowel = word[i - 1] in vowels
            next_is_vowel = word[i + 1] in vowels
            if prev_is_vowel and next_is_vowel:
                if word[i] == "u":
                    word = word[:i] + "U" + word[i + 1 :]
                elif word[i] == "i":
                    word = word[:i] + "I" + word[i + 1 :]
            if (prev_is_vowel or next_is_vowel) and word[i] == "y":
                word = word[:i] + "Y" + word[i + 1 :]

        r1, r2 = self._r1r2_standard(word, vowels)
        rv = self.__rv_french(word, vowels)

        # STEP 1: Standard suffix removal
        # Only the first table entry the word ends with is considered.
        # r1 and r2 are not updated inside this step.
        for suffix in self.__step1_suffixes:
            if not word.endswith(suffix):
                continue
            size = len(suffix)

            if suffix == "eaux":
                word = word[:-1]
                step1_success = True

            elif suffix in ("euse", "euses"):
                if suffix in r2:
                    word = word[:-size]
                    step1_success = True
                elif suffix in r1:
                    word = suffix_replace(word, suffix, "eux")
                    step1_success = True

            elif suffix in ("ement", "ements") and suffix in rv:
                word = word[:-size]
                step1_success = True

                if word[-2:] == "iv" and "iv" in r2:
                    word = word[:-2]
                    if word[-2:] == "at" and "at" in r2:
                        word = word[:-2]
                elif word[-3:] == "eus":
                    if "eus" in r2:
                        word = word[:-3]
                    elif "eus" in r1:
                        word = word[:-1] + "x"
                elif word[-3:] in ("abl", "iqU"):
                    if "abl" in r2 or "iqU" in r2:
                        word = word[:-3]
                elif word[-3:] in ("i\xE8r", "I\xE8r"):
                    if "i\xE8r" in rv or "I\xE8r" in rv:
                        word = word[:-3] + "i"

            elif suffix == "amment" and suffix in rv:
                word = suffix_replace(word, "amment", "ant")
                rv = suffix_replace(rv, "amment", "ant")
                rv_ending_found = True

            elif suffix == "emment" and suffix in rv:
                word = suffix_replace(word, "emment", "ent")
                rv_ending_found = True

            elif (
                suffix in ("ment", "ments")
                and suffix in rv
                and not rv.startswith(suffix)
                and rv[rv.rindex(suffix) - 1] in vowels
            ):
                word = word[:-size]
                rv = rv[:-size]
                rv_ending_found = True

            elif suffix == "aux" and suffix in r1:
                word = word[:-2] + "l"
                step1_success = True

            elif (
                suffix in ("issement", "issements")
                and suffix in r1
                and word[-size - 1] not in vowels
            ):
                word = word[:-size]
                step1_success = True

            elif (
                suffix
                in (
                    "ance",
                    "iqUe",
                    "isme",
                    "able",
                    "iste",
                    "eux",
                    "ances",
                    "iqUes",
                    "ismes",
                    "ables",
                    "istes",
                )
                and suffix in r2
            ):
                word = word[:-size]
                step1_success = True

            elif (
                suffix
                in ("atrice", "ateur", "ation", "atrices", "ateurs", "ations")
                and suffix in r2
            ):
                word = word[:-size]
                step1_success = True

                if word[-2:] == "ic":
                    if "ic" in r2:
                        word = word[:-2]
                    else:
                        word = word[:-2] + "iqU"

            elif suffix in ("logie", "logies") and suffix in r2:
                word = suffix_replace(word, suffix, "log")
                step1_success = True

            elif suffix in ("usion", "ution", "usions", "utions") and suffix in r2:
                word = suffix_replace(word, suffix, "u")
                step1_success = True

            elif suffix in ("ence", "ences") and suffix in r2:
                word = suffix_replace(word, suffix, "ent")
                step1_success = True

            elif suffix in ("it\xE9", "it\xE9s") and suffix in r2:
                word = word[:-size]
                step1_success = True

                if word[-4:] == "abil":
                    if "abil" in r2:
                        word = word[:-4]
                    else:
                        word = word[:-2] + "l"
                elif word[-2:] == "ic":
                    if "ic" in r2:
                        word = word[:-2]
                    else:
                        word = word[:-2] + "iqU"
                elif word[-2:] == "iv":
                    if "iv" in r2:
                        word = word[:-2]

            elif suffix in ("if", "ive", "ifs", "ives") and suffix in r2:
                word = word[:-size]
                step1_success = True

                if word[-2:] == "at" and "at" in r2:
                    word = word[:-2]
                    if word[-2:] == "ic":
                        if "ic" in r2:
                            word = word[:-2]
                        else:
                            word = word[:-2] + "iqU"
            break

        # STEP 2a: Verb suffixes beginning 'i'
        # Runs when step 1 removed nothing, or when one of the
        # amment/emment/ment(s) endings was handled there.
        if not step1_success or rv_ending_found:
            for suffix in self.__step2a_suffixes:
                if not word.endswith(suffix):
                    continue
                if (
                    suffix in rv
                    and len(rv) > len(suffix)
                    and rv[rv.rindex(suffix) - 1] not in vowels
                ):
                    word = word[: -len(suffix)]
                    step2a_success = True
                break

            # STEP 2b: Other verb suffixes
            # Attempted only when step 2a ran without deleting anything.
            if not step2a_success:
                for suffix in self.__step2b_suffixes:
                    if not rv.endswith(suffix):
                        continue
                    if suffix == "ions" and "ions" in r2:
                        word = word[:-4]
                        step2b_success = True

                    elif suffix in (
                        "eraIent",
                        "erions",
                        "\xE8rent",
                        "erais",
                        "erait",
                        "eriez",
                        "erons",
                        "eront",
                        "erai",
                        "eras",
                        "erez",
                        "\xE9es",
                        "era",
                        "iez",
                        "\xE9e",
                        "\xE9s",
                        "er",
                        "ez",
                        "\xE9",
                    ):
                        word = word[: -len(suffix)]
                        step2b_success = True

                    elif suffix in (
                        "assions",
                        "assent",
                        "assiez",
                        "aIent",
                        "antes",
                        "asses",
                        "\xE2mes",
                        "\xE2tes",
                        "ante",
                        "ants",
                        "asse",
                        "ais",
                        "ait",
                        "ant",
                        "\xE2t",
                        "ai",
                        "as",
                        "a",
                    ):
                        word = word[: -len(suffix)]
                        rv = rv[: -len(suffix)]
                        step2b_success = True
                        # An 'e' left at the end of RV is dropped as well.
                        if rv.endswith("e"):
                            word = word[:-1]
                    break

        # STEP 3
        if step1_success or step2a_success or step2b_success:
            if word[-1] == "Y":
                word = word[:-1] + "i"
            elif word[-1] == "\xE7":
                word = word[:-1] + "c"

        # STEP 4: Residual suffixes
        else:
            if len(word) >= 2 and word[-1] == "s" and word[-2] not in "aiou\xE8s":
                word = word[:-1]

            for suffix in self.__step4_suffixes:
                if word.endswith(suffix) and suffix in rv:
                    if suffix == "ion" and suffix in r2 and rv[-4] in "st":
                        word = word[:-3]
                    elif suffix in ("ier", "i\xE8re", "Ier", "I\xE8re"):
                        word = suffix_replace(word, suffix, "i")
                    elif suffix == "e":
                        word = word[:-1]
                    elif suffix == "\xEB" and word[-3:-1] == "gu":
                        word = word[:-1]
                    break

        # STEP 5: Undouble
        if word.endswith(("enn", "onn", "ett", "ell", "eill")):
            word = word[:-1]

        # STEP 6: Un-accent
        # Walk backwards to the last vowel (the first letter of the word is
        # never inspected).  If that vowel is an accented 'e' and at least
        # one non-vowel follows it, the accent is removed.
        for i in range(1, len(word)):
            if word[-i] in vowels:
                if i != 1 and word[-i] in ("\xE9", "\xE8"):
                    word = word[:-i] + "e" + word[-i + 1 :]
                break

        # Put the marker letters back into lower case.
        word = word.replace("I", "i").replace("U", "u").replace("Y", "y")

        return word

    def __rv_french(self, word, vowels):
        """
        Return the region RV that is used by the French stemmer.

        If the word begins with two vowels, RV is the region after
        the third letter. Otherwise, it is the region after the first
        vowel not at the beginning of the word, or the end of the word
        if these positions cannot be found. (Exceptionally, u'par',
        u'col' or u'tap' at the beginning of a word is also taken to
        define RV as the region to their right.)

        Words shorter than two letters always yield the empty region.

        :param word: The French word whose region RV is determined.
        :type word: str or unicode
        :param vowels: The French vowels that are used to determine
                       the region RV.
        :type vowels: unicode
        :return: the region RV for the respective French word.
        :rtype: unicode
        :note: This helper method is invoked by the stem method of
               the subclass FrenchStemmer. It is not to be invoked directly!
        """
        if len(word) < 2:
            return ""

        if word.startswith(("par", "col", "tap")) or (
            word[0] in vowels and word[1] in vowels
        ):
            return word[3:]

        # Region after the first vowel that is not the initial letter.
        for i in range(1, len(word)):
            if word[i] in vowels:
                return word[i + 1 :]
        return ""
class GermanStemmer(_StandardStemmer):
    """
    The German Snowball stemmer.

    :cvar __vowels: The German vowels.
    :type __vowels: unicode
    :cvar __s_ending: Letters that may directly appear before a word final 's'.
    :type __s_ending: unicode
    :cvar __st_ending: Letter that may directly appear before a word final 'st'.
    :type __st_ending: unicode
    :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm.
    :type __step1_suffixes: tuple
    :cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm.
    :type __step2_suffixes: tuple
    :cvar __step3_suffixes: Suffixes to be deleted in step 3 of the algorithm.
    :type __step3_suffixes: tuple
    :note: A detailed description of the German
           stemming algorithm can be found under
           http://snowball.tartarus.org/algorithms/german/stemmer.html
    """

    __vowels = "aeiouy\xE4\xF6\xFC"
    __s_ending = "bdfghklmnrt"
    __st_ending = "bdfghklmnt"

    __step1_suffixes = ("ern", "em", "er", "en", "es", "e", "s")
    __step2_suffixes = ("est", "en", "er", "st")
    __step3_suffixes = ("isch", "lich", "heit", "keit", "end", "ung", "ig", "ik")

    def stem(self, word):
        """
        Stem a German word and return the stemmed form.

        The word is lower-cased and returned unchanged when it appears in
        ``self.stopwords``.  Otherwise every sharp s is rewritten as "ss",
        'u'/'y' between vowels are temporarily upper-cased, three
        suffix-removal steps run, and finally umlauts are replaced by their
        base vowels and the markers are lower-cased again.

        :param word: The word that is stemmed.
        :type word: str or unicode
        :return: The stemmed form.
        :rtype: unicode
        """
        word = word.lower()
        if word in self.stopwords:
            return word

        word = word.replace("\xDF", "ss")
        vowels = self.__vowels

        # Every occurrence of 'u' and 'y'
        # between vowels is put into upper case.
        for i in range(1, len(word) - 1):
            if word[i - 1] in vowels and word[i + 1] in vowels:
                if word[i] == "u":
                    word = word[:i] + "U" + word[i + 1 :]
                elif word[i] == "y":
                    word = word[:i] + "Y" + word[i + 1 :]

        r1, r2 = self._r1r2_standard(word, vowels)

        # R1 is adjusted so that the region before it
        # contains at least 3 letters.
        for i in range(1, len(word)):
            if word[i] not in vowels and word[i - 1] in vowels:
                # i is the first non-vowel that follows a vowel, so the
                # prefix word[:i + 1] has i + 1 letters.  When that is fewer
                # than three, R1 is moved to start at index 3.  r2 is left
                # as computed above.
                if i + 1 < 3:
                    r1 = word[3:]
                break

        # STEP 1
        for suffix in self.__step1_suffixes:
            if not r1.endswith(suffix):
                continue
            n = len(suffix)
            if suffix in ("en", "es", "e") and word[-n - 4 : -n] == "niss":
                # "niss" + ending: one extra letter is removed so that a
                # single 's' remains.
                word = word[: -n - 1]
                r1 = r1[: -n - 1]
                r2 = r2[: -n - 1]
            elif suffix == "s":
                if word[-2] in self.__s_ending:
                    word, r1, r2 = word[:-1], r1[:-1], r2[:-1]
            else:
                word, r1, r2 = word[:-n], r1[:-n], r2[:-n]
            break

        # STEP 2
        for suffix in self.__step2_suffixes:
            if not r1.endswith(suffix):
                continue
            if suffix == "st":
                # Needs a permitted letter before "st" and at least three
                # letters in front of that letter.
                if word[-3] in self.__st_ending and len(word[:-3]) >= 3:
                    word, r1, r2 = word[:-2], r1[:-2], r2[:-2]
            else:
                n = len(suffix)
                word, r1, r2 = word[:-n], r1[:-n], r2[:-n]
            break

        # STEP 3: Derivational suffixes
        # Only ``word`` is shortened here; the regions are merely consulted.
        for suffix in self.__step3_suffixes:
            if not r2.endswith(suffix):
                continue
            n = len(suffix)
            if suffix in ("end", "ung"):
                if (
                    "ig" in r2[-n - 2 : -n]
                    and "e" not in r2[-n - 3 : -n - 2]
                ):
                    word = word[: -n - 2]
                else:
                    word = word[:-n]

            elif (
                suffix in ("ig", "ik", "isch")
                and "e" not in r2[-n - 1 : -n]
            ):
                word = word[:-n]

            elif suffix in ("lich", "heit"):
                if "er" in r1[-n - 2 : -n] or "en" in r1[-n - 2 : -n]:
                    word = word[: -n - 2]
                else:
                    word = word[:-n]

            elif suffix == "keit":
                if "lich" in r2[-n - 4 : -n]:
                    word = word[: -n - 4]
                elif "ig" in r2[-n - 2 : -n]:
                    word = word[: -n - 2]
                else:
                    word = word[:-n]
            break

        # Umlaut accents are removed and
        # 'u' and 'y' are put back into lower case.
        word = (
            word.replace("\xE4", "a")
            .replace("\xF6", "o")
            .replace("\xFC", "u")
            .replace("U", "u")
            .replace("Y", "y")
        )

        return word
class HungarianStemmer(_LanguageSpecificStemmer):
    """
    The Hungarian Snowball stemmer.

    :cvar __vowels: The Hungarian vowels.
    :type __vowels: unicode
    :cvar __digraphs: The Hungarian digraphs.
    :type __digraphs: tuple
    :cvar __double_consonants: The Hungarian double consonants.
    :type __double_consonants: tuple
    :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm.
    :type __step1_suffixes: tuple
    :cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm.
    :type __step2_suffixes: tuple
    :cvar __step3_suffixes: Suffixes to be deleted in step 3 of the algorithm.
    :type __step3_suffixes: tuple
    :cvar __step4_suffixes: Suffixes to be deleted in step 4 of the algorithm.
    :type __step4_suffixes: tuple
    :cvar __step5_suffixes: Suffixes to be deleted in step 5 of the algorithm.
    :type __step5_suffixes: tuple
    :cvar __step6_suffixes: Suffixes to be deleted in step 6 of the algorithm.
    :type __step6_suffixes: tuple
    :cvar __step7_suffixes: Suffixes to be deleted in step 7 of the algorithm.
    :type __step7_suffixes: tuple
    :cvar __step8_suffixes: Suffixes to be deleted in step 8 of the algorithm.
    :type __step8_suffixes: tuple
    :cvar __step9_suffixes: Suffixes to be deleted in step 9 of the algorithm.
    :type __step9_suffixes: tuple
    :note: A detailed description of the Hungarian
           stemming algorithm can be found under
           http://snowball.tartarus.org/algorithms/hungarian/stemmer.html
    """

    __vowels = "aeiou\xF6\xFC\xE1\xE9\xED\xF3\xF5\xFA\xFB"
    __digraphs = ("cs", "dz", "dzs", "gy", "ly", "ny", "ty", "zs")
    __double_consonants = (
        "bb", "cc", "ccs", "dd", "ff", "gg", "ggy", "jj", "kk", "ll", "lly",
        "mm", "nn", "nny", "pp", "rr", "ss", "ssz", "tt", "tty", "vv", "zz",
        "zzs",
    )

    __step1_suffixes = ("al", "el")
    __step2_suffixes = (
        "k\xE9ppen", "onk\xE9nt", "enk\xE9nt", "ank\xE9nt", "k\xE9pp", "k\xE9nt",
        "ban", "ben", "nak", "nek", "val", "vel",
        "t\xF3l", "t\xF5l", "r\xF3l", "r\xF5l", "b\xF3l", "b\xF5l",
        "hoz", "hez", "h\xF6z", "n\xE1l", "n\xE9l", "\xE9rt", "kor",
        "ba", "be", "ra", "re", "ig", "at", "et", "ot", "\xF6t",
        "ul", "\xFCl", "v\xE1", "v\xE9", "en", "on", "an", "\xF6n",
        "n", "t",
    )
    __step3_suffixes = ("\xE1nk\xE9nt", "\xE1n", "\xE9n")
    __step4_suffixes = (
        "astul", "est\xFCl", "\xE1stul", "\xE9st\xFCl", "stul", "st\xFCl",
    )
    __step5_suffixes = ("\xE1", "\xE9")
    __step6_suffixes = (
        "ok\xE9", "\xF6k\xE9", "ak\xE9", "ek\xE9", "\xE1k\xE9", "\xE1\xE9i",
        "\xE9k\xE9", "\xE9\xE9i", "k\xE9", "\xE9i", "\xE9\xE9", "\xE9",
    )
    __step7_suffixes = (
        "\xE1juk", "\xE9j\xFCk", "\xFCnk", "unk", "juk", "j\xFCk",
        "\xE1nk", "\xE9nk", "nk", "uk", "\xFCk",
        "em", "om", "am", "od", "ed", "ad", "\xF6d", "ja", "je",
        "\xE1m", "\xE1d", "\xE9m", "\xE9d",
        "m", "d", "a", "e", "o", "\xE1", "\xE9",
    )
    __step8_suffixes = (
        "jaitok", "jeitek", "jaink", "jeink", "aitok", "eitek",
        "\xE1itok", "\xE9itek",
        "jaim", "jeim", "jaid", "jeid", "eink", "aink", "itek",
        "jeik", "jaik", "\xE1ink", "\xE9ink",
        "aim", "eim", "aid", "eid", "jai", "jei", "ink", "aik", "eik",
        "\xE1im", "\xE1id", "\xE1ik", "\xE9im", "\xE9id", "\xE9ik",
        "im", "id", "ai", "ei", "ik", "\xE1i", "\xE9i", "i",
    )
    __step9_suffixes = ("\xE1k", "\xE9k", "\xF6k", "ok", "ek", "ak", "k")

    def stem(self, word):
        """
        Stem an Hungarian word and return the stemmed form.

        The word is lower-cased and returned unchanged when it appears in
        ``self.stopwords``.  Otherwise nine suffix steps run in sequence on
        ``word`` and its region ``r1``.  An empty word is returned as the
        empty string.

        :param word: The word that is stemmed.
        :type word: str or unicode
        :return: The stemmed form.
        :rtype: unicode
        """
        word = word.lower()

        if word in self.stopwords:
            return word

        r1 = self.__r1_hungarian(word, self.__vowels, self.__digraphs)

        # STEP 1: Remove instrumental case
        # When "al"/"el" follows a double consonant, the ending and one
        # letter of the doubling are dropped.
        if r1.endswith(self.__step1_suffixes):
            for double_cons in self.__double_consonants:
                if word[-2 - len(double_cons) : -2] == double_cons:
                    word = "".join((word[:-4], word[-3]))

                    if r1[-2 - len(double_cons) : -2] == double_cons:
                        r1 = "".join((r1[:-4], r1[-3]))
                    break

        # STEP 2: Remove frequent cases
        # The scan stops at the first table entry the word ends with, even
        # if that entry lies outside R1 and is therefore left in place.
        for suffix in self.__step2_suffixes:
            if word.endswith(suffix):
                if r1.endswith(suffix):
                    word = word[: -len(suffix)]
                    r1 = r1[: -len(suffix)]

                    # A long final vowel exposed in R1 is shortened.
                    if r1.endswith("\xE1"):
                        word = "".join((word[:-1], "a"))
                        r1 = suffix_replace(r1, "\xE1", "a")

                    elif r1.endswith("\xE9"):
                        word = "".join((word[:-1], "e"))
                        r1 = suffix_replace(r1, "\xE9", "e")
                break

        # STEP 3: Remove special cases
        for suffix in self.__step3_suffixes:
            if r1.endswith(suffix):
                if suffix == "\xE9n":
                    word = suffix_replace(word, suffix, "e")
                    r1 = suffix_replace(r1, suffix, "e")
                else:
                    word = suffix_replace(word, suffix, "a")
                    r1 = suffix_replace(r1, suffix, "a")
                break

        # STEP 4: Remove other cases
        for suffix in self.__step4_suffixes:
            if r1.endswith(suffix):
                if suffix == "\xE1stul":
                    word = suffix_replace(word, suffix, "a")
                    r1 = suffix_replace(r1, suffix, "a")

                elif suffix == "\xE9st\xFCl":
                    word = suffix_replace(word, suffix, "e")
                    r1 = suffix_replace(r1, suffix, "e")
                else:
                    word = word[: -len(suffix)]
                    r1 = r1[: -len(suffix)]
                break

        # STEP 5: Remove factive case
        # Same undoubling as in step 1, but for a one-letter ending.
        for suffix in self.__step5_suffixes:
            if r1.endswith(suffix):
                for double_cons in self.__double_consonants:
                    if word[-1 - len(double_cons) : -1] == double_cons:
                        word = "".join((word[:-3], word[-2]))

                        if r1[-1 - len(double_cons) : -1] == double_cons:
                            r1 = "".join((r1[:-3], r1[-2]))
                        break

        # STEP 6: Remove owned
        for suffix in self.__step6_suffixes:
            if r1.endswith(suffix):
                if suffix in ("\xE1k\xE9", "\xE1\xE9i"):
                    word = suffix_replace(word, suffix, "a")
                    r1 = suffix_replace(r1, suffix, "a")

                elif suffix in ("\xE9k\xE9", "\xE9\xE9i", "\xE9\xE9"):
                    word = suffix_replace(word, suffix, "e")
                    r1 = suffix_replace(r1, suffix, "e")
                else:
                    word = word[: -len(suffix)]
                    r1 = r1[: -len(suffix)]
                break

        # STEP 7: Remove singular owner suffixes
        for suffix in self.__step7_suffixes:
            if word.endswith(suffix):
                if r1.endswith(suffix):
                    if suffix in ("\xE1nk", "\xE1juk", "\xE1m", "\xE1d", "\xE1"):
                        word = suffix_replace(word, suffix, "a")
                        r1 = suffix_replace(r1, suffix, "a")

                    elif suffix in ("\xE9nk", "\xE9j\xFCk", "\xE9m", "\xE9d", "\xE9"):
                        word = suffix_replace(word, suffix, "e")
                        r1 = suffix_replace(r1, suffix, "e")
                    else:
                        word = word[: -len(suffix)]
                        r1 = r1[: -len(suffix)]
                break

        # STEP 8: Remove plural owner suffixes
        for suffix in self.__step8_suffixes:
            if word.endswith(suffix):
                if r1.endswith(suffix):
                    if suffix in (
                        "\xE1im",
                        "\xE1id",
                        "\xE1i",
                        "\xE1ink",
                        "\xE1itok",
                        "\xE1ik",
                    ):
                        word = suffix_replace(word, suffix, "a")
                        r1 = suffix_replace(r1, suffix, "a")

                    elif suffix in (
                        "\xE9im",
                        "\xE9id",
                        "\xE9i",
                        "\xE9ink",
                        "\xE9itek",
                        "\xE9ik",
                    ):
                        word = suffix_replace(word, suffix, "e")
                        r1 = suffix_replace(r1, suffix, "e")
                    else:
                        word = word[: -len(suffix)]
                        r1 = r1[: -len(suffix)]
                break

        # STEP 9: Remove plural suffixes
        # Last step, so r1 no longer needs to be kept in sync.
        for suffix in self.__step9_suffixes:
            if word.endswith(suffix):
                if r1.endswith(suffix):
                    if suffix == "\xE1k":
                        word = suffix_replace(word, suffix, "a")
                    elif suffix == "\xE9k":
                        word = suffix_replace(word, suffix, "e")
                    else:
                        word = word[: -len(suffix)]
                break

        return word

    def __r1_hungarian(self, word, vowels, digraphs):
        """
        Return the region R1 that is used by the Hungarian stemmer.

        If the word begins with a vowel, R1 is defined as the region
        after the first consonant or digraph (= two letters stand for
        one phoneme) in the word. If the word begins with a consonant,
        it is defined as the region after the first vowel in the word.
        If the word does not contain both a vowel and consonant, R1
        is the null region at the end of the word.

        An empty word yields the empty region.  When several entries of
        ``digraphs`` start at the first consonant, the longest one is
        consumed.

        :param word: The Hungarian word whose region R1 is determined.
        :type word: str or unicode
        :param vowels: The Hungarian vowels that are used to determine
                       the region R1.
        :type vowels: unicode
        :param digraphs: The digraphs that are used to determine the
                         region R1.
        :type digraphs: tuple
        :return: the region R1 for the respective word.
        :rtype: unicode
        :note: This helper method is invoked by the stem method of the subclass
               HungarianStemmer. It is not to be invoked directly!
        """
        # Guard: indexing word[0] on an empty string would raise IndexError.
        if not word:
            return ""

        if word[0] in vowels:
            for i in range(1, len(word)):
                if word[i] in vowels:
                    continue
                # word[i] is the first non-vowel.  Skip the longest digraph
                # that starts exactly here, or just this one letter.
                width = 1
                for digraph in digraphs:
                    if len(digraph) > width and word.startswith(digraph, i):
                        width = len(digraph)
                return word[i + width :]
            return ""

        for i in range(1, len(word)):
            if word[i] in vowels:
                return word[i + 1 :]
        return ""
class ItalianStemmer(_StandardStemmer):
    """
    The Italian Snowball stemmer.

    :cvar __vowels: The Italian vowels.
    :type __vowels: unicode
    :cvar __step0_suffixes: Suffixes to be deleted in step 0 of the algorithm.
    :type __step0_suffixes: tuple
    :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm.
    :type __step1_suffixes: tuple
    :cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm.
    :type __step2_suffixes: tuple
    :note: A detailed description of the Italian
           stemming algorithm can be found under
           http://snowball.tartarus.org/algorithms/italian/stemmer.html
    """

    __vowels = "aeiou\xE0\xE8\xEC\xF2\xF9"

    # Every table is ordered longest suffix first; the loops in stem() stop
    # at the first entry that matches, so the order must be preserved.
    __step0_suffixes = (
        "gliela", "gliele", "glieli", "glielo", "gliene",
        "sene",
        "mela", "mele", "meli", "melo", "mene",
        "tela", "tele", "teli", "telo", "tene",
        "cela", "cele", "celi", "celo", "cene",
        "vela", "vele", "veli", "velo", "vene",
        "gli",
        "ci", "la", "le", "li", "lo", "mi", "ne", "si", "ti", "vi",
    )
    __step1_suffixes = (
        "atrice", "atrici", "azione", "azioni", "uzione", "uzioni",
        "usione", "usioni", "amento", "amenti", "imento", "imenti",
        "amente",
        "abile", "abili", "ibile", "ibili", "mente", "atore", "atori",
        "logia", "logie",
        "anza", "anze", "iche", "ichi", "ismo", "ismi",
        "ista", "iste", "isti", "ist\xE0", "ist\xE8", "ist\xEC",
        "ante", "anti", "enza", "enze",
        "ico", "ici", "ica", "ice", "oso", "osi", "osa", "ose",
        "it\xE0",
        "ivo", "ivi", "iva", "ive",
    )
    __step2_suffixes = (
        "erebbero", "irebbero",
        "assero", "assimo", "eranno", "erebbe", "eremmo", "ereste",
        "eresti", "essero", "iranno", "irebbe", "iremmo", "ireste",
        "iresti", "iscano", "iscono", "issero",
        "arono", "avamo", "avano", "avate", "eremo", "erete", "erono",
        "evamo", "evano", "evate", "iremo", "irete", "irono", "ivamo",
        "ivano", "ivate",
        "ammo", "ando", "asse", "assi", "emmo", "enda", "ende", "endi",
        "endo", "erai", "erei", "Yamo", "iamo", "immo", "irai", "irei",
        "isca", "isce", "isci", "isco",
        "ano", "are", "ata", "ate", "ati", "ato", "ava", "avi", "avo",
        "er\xE0", "ere", "er\xF2", "ete", "eva", "evi", "evo",
        "ir\xE0", "ire", "ir\xF2", "ita", "ite", "iti", "ito",
        "iva", "ivi", "ivo", "ono", "uta", "ute", "uti", "uto",
        "ar", "ir",
    )

    def stem(self, word):
        """
        Stem an Italian word and return the stemmed form.

        The word is lower-cased first; stop words are returned as they
        are after lower-casing.

        :param word: The word that is stemmed.
        :type word: str or unicode
        :return: The stemmed form.
        :rtype: unicode
        """
        word = word.lower()

        if word in self.stopwords:
            return word

        step1_success = False

        # All acute accents are replaced by grave accents.
        word = (
            word.replace("\xE1", "\xE0")
            .replace("\xE9", "\xE8")
            .replace("\xED", "\xEC")
            .replace("\xF3", "\xF2")
            .replace("\xFA", "\xF9")
        )

        # Every occurrence of 'u' after 'q'
        # is put into upper case.
        for i in range(1, len(word)):
            if word[i - 1] == "q" and word[i] == "u":
                word = "".join((word[:i], "U", word[i + 1 :]))

        # Every occurrence of 'u' and 'i'
        # between vowels is put into upper case.
        # The upper-case letters are not in __vowels, so a letter marked
        # here no longer counts as a vowel for its right-hand neighbour.
        for i in range(1, len(word) - 1):
            if word[i - 1] in self.__vowels and word[i + 1] in self.__vowels:
                if word[i] == "u":
                    word = "".join((word[:i], "U", word[i + 1 :]))
                elif word[i] == "i":
                    word = "".join((word[:i], "I", word[i + 1 :]))

        r1, r2 = self._r1r2_standard(word, self.__vowels)
        rv = self._rv_standard(word, self.__vowels)

        # STEP 0: Attached pronoun
        for suffix in self.__step0_suffixes:
            if rv.endswith(suffix):
                if rv[-len(suffix) - 4 : -len(suffix)] in ("ando", "endo"):
                    # Gerund + pronoun: drop the pronoun.
                    word = word[: -len(suffix)]
                    r1 = r1[: -len(suffix)]
                    r2 = r2[: -len(suffix)]
                    rv = rv[: -len(suffix)]
                elif rv[-len(suffix) - 2 : -len(suffix)] in ("ar", "er", "ir"):
                    # Truncated infinitive + pronoun: restore the final 'e'.
                    word = suffix_replace(word, suffix, "e")
                    r1 = suffix_replace(r1, suffix, "e")
                    r2 = suffix_replace(r2, suffix, "e")
                    rv = suffix_replace(rv, suffix, "e")
                break

        # STEP 1: Standard suffix removal
        for suffix in self.__step1_suffixes:
            if word.endswith(suffix):
                if suffix == "amente" and r1.endswith(suffix):
                    step1_success = True
                    word = word[:-6]
                    r2 = r2[:-6]
                    rv = rv[:-6]

                    if r2.endswith("iv"):
                        word = word[:-2]
                        r2 = r2[:-2]
                        rv = rv[:-2]

                        if r2.endswith("at"):
                            word = word[:-2]
                            rv = rv[:-2]

                    elif r2.endswith(("os", "ic")):
                        word = word[:-2]
                        rv = rv[:-2]

                    elif r2.endswith("abil"):
                        word = word[:-4]
                        rv = rv[:-4]

                elif suffix in ("amento", "amenti", "imento", "imenti") and rv.endswith(
                    suffix
                ):
                    step1_success = True
                    word = word[:-6]
                    rv = rv[:-6]

                elif r2.endswith(suffix):
                    step1_success = True
                    if suffix in ("azione", "azioni", "atore", "atori"):
                        word = word[: -len(suffix)]
                        r2 = r2[: -len(suffix)]
                        rv = rv[: -len(suffix)]

                        if r2.endswith("ic"):
                            word = word[:-2]
                            rv = rv[:-2]

                    elif suffix in ("logia", "logie"):
                        # Replace with "log": drop the last two letters of
                        # the word and of RV (RV was previously rebuilt
                        # from the word, which turned it into a prefix).
                        word = word[:-2]
                        rv = rv[:-2]

                    elif suffix in ("uzione", "uzioni", "usione", "usioni"):
                        # Replace with "u".
                        word = word[:-5]
                        rv = rv[:-5]

                    elif suffix in ("enza", "enze"):
                        # Snowball rule: replace with "ente"
                        # (previously the whole suffix became "te").
                        word = suffix_replace(word, suffix, "ente")
                        rv = suffix_replace(rv, suffix, "ente")

                    elif suffix == "it\xE0":
                        word = word[:-3]
                        r2 = r2[:-3]
                        rv = rv[:-3]

                        if r2.endswith(("ic", "iv")):
                            word = word[:-2]
                            rv = rv[:-2]

                        elif r2.endswith("abil"):
                            word = word[:-4]
                            rv = rv[:-4]

                    elif suffix in ("ivo", "ivi", "iva", "ive"):
                        word = word[:-3]
                        r2 = r2[:-3]
                        rv = rv[:-3]

                        if r2.endswith("at"):
                            word = word[:-2]
                            r2 = r2[:-2]
                            rv = rv[:-2]

                            if r2.endswith("ic"):
                                word = word[:-2]
                                rv = rv[:-2]
                    else:
                        word = word[: -len(suffix)]
                        rv = rv[: -len(suffix)]
                # Only the first (longest) suffix the word ends with is
                # considered, whether or not it was removed.
                break

        # STEP 2: Verb suffixes
        if not step1_success:
            for suffix in self.__step2_suffixes:
                if rv.endswith(suffix):
                    word = word[: -len(suffix)]
                    rv = rv[: -len(suffix)]
                    break

        # STEP 3a
        if rv.endswith(("a", "e", "i", "o", "\xE0", "\xE8", "\xEC", "\xF2")):
            word = word[:-1]
            rv = rv[:-1]

            if rv.endswith("i"):
                word = word[:-1]
                rv = rv[:-1]

        # STEP 3b
        if rv.endswith(("ch", "gh")):
            word = word[:-1]

        # Undo the upper-case markers introduced before the regions
        # were computed.
        word = word.replace("I", "i").replace("U", "u")

        return word
class NorwegianStemmer(_ScandinavianStemmer):
    """
    The Norwegian Snowball stemmer.

    :cvar __vowels: The Norwegian vowels.
    :type __vowels: unicode
    :cvar __s_ending: Letters that may directly appear before a word final 's'.
    :type __s_ending: unicode
    :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm.
    :type __step1_suffixes: tuple
    :cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm.
    :type __step2_suffixes: tuple
    :cvar __step3_suffixes: Suffixes to be deleted in step 3 of the algorithm.
    :type __step3_suffixes: tuple
    :note: A detailed description of the Norwegian
           stemming algorithm can be found under
           http://snowball.tartarus.org/algorithms/norwegian/stemmer.html
    """

    __vowels = "aeiouy\xE6\xE5\xF8"
    __s_ending = "bcdfghjlmnoprtvyz"

    # Longest suffixes first: stem() stops at the first entry that matches.
    __step1_suffixes = (
        "hetenes",
        "hetene", "hetens",
        "heter", "heten", "endes",
        "ande", "ende", "edes", "enes", "erte",
        "ede", "ane", "ene", "ens", "ers", "ets", "het", "ast", "ert",
        "en", "ar", "er", "as", "es", "et",
        "a", "e", "s",
    )
    __step2_suffixes = ("dt", "vt")
    __step3_suffixes = (
        "hetslov",
        "eleg", "elig", "elov", "slov",
        "leg", "eig", "lig", "els", "lov",
        "ig",
    )

    def stem(self, word):
        """
        Stem a Norwegian word and return the stemmed form.

        The word is lower-cased first; stop words are returned as they
        are after lower-casing.

        :param word: The word that is stemmed.
        :type word: str or unicode
        :return: The stemmed form.
        :rtype: unicode
        """
        word = word.lower()
        if word in self.stopwords:
            return word

        r1 = self._r1_scandinavian(word, self.__vowels)

        # STEP 1
        for ending in self.__step1_suffixes:
            if not r1.endswith(ending):
                continue

            if ending == "s":
                # A final 's' is only dropped after a valid s-ending, or
                # after 'k' that does not follow a vowel.
                before = word[-2]
                if before in self.__s_ending or (
                    before == "k" and word[-3] not in self.__vowels
                ):
                    word = word[:-1]
                    r1 = r1[:-1]
            elif ending in ("erte", "ert"):
                word = suffix_replace(word, ending, "er")
                r1 = suffix_replace(r1, ending, "er")
            else:
                cut = len(ending)
                word = word[:-cut]
                r1 = r1[:-cut]
            break

        # STEP 2: for "dt" / "vt" in R1 only the trailing 't' goes.
        if r1.endswith(self.__step2_suffixes):
            word = word[:-1]
            r1 = r1[:-1]

        # STEP 3
        for ending in self.__step3_suffixes:
            if r1.endswith(ending):
                return word[: -len(ending)]

        return word
class PortugueseStemmer(_StandardStemmer):
    """
    The Portuguese Snowball stemmer.

    :cvar __vowels: The Portuguese vowels.
    :type __vowels: unicode
    :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm.
    :type __step1_suffixes: tuple
    :cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm.
    :type __step2_suffixes: tuple
    :cvar __step4_suffixes: Suffixes to be deleted in step 4 of the algorithm.
    :type __step4_suffixes: tuple
    :note: A detailed description of the Portuguese
           stemming algorithm can be found under
           http://snowball.tartarus.org/algorithms/portuguese/stemmer.html
    """

    __vowels = "aeiou\xE1\xE9\xED\xF3\xFA\xE2\xEA\xF4"

    # The tables are scanned in order and the first hit wins, so their
    # ordering is part of the behaviour.
    __step1_suffixes = (
        "amentos", "imentos", "uço~es",
        "amento", "imento", "adoras", "adores", "a\xE7o~es", "logias",
        "\xEAncias", "amente", "idades",
        "an\xE7as", "ismos", "istas", "adora", "a\xE7a~o", "antes",
        "\xE2ncia", "logia", "uça~o", "\xEAncia", "mente", "idade",
        "an\xE7a", "ezas", "icos", "icas", "ismo", "\xE1vel", "\xEDvel",
        "ista", "osos", "osas", "ador", "ante", "ivas", "ivos", "iras",
        "eza", "ico", "ica", "oso", "osa", "iva", "ivo", "ira",
    )
    __step2_suffixes = (
        "ar\xEDamos", "er\xEDamos", "ir\xEDamos",
        "\xE1ssemos", "\xEAssemos", "\xEDssemos",
        "ar\xEDeis", "er\xEDeis", "ir\xEDeis",
        "\xE1sseis", "\xE9sseis", "\xEDsseis",
        "\xE1ramos", "\xE9ramos", "\xEDramos", "\xE1vamos",
        "aremos", "eremos", "iremos",
        "ariam", "eriam", "iriam", "assem", "essem", "issem",
        "ara~o", "era~o", "ira~o", "arias", "erias", "irias",
        "ardes", "erdes", "irdes", "asses", "esses", "isses",
        "astes", "estes", "istes",
        "\xE1reis", "areis", "\xE9reis", "ereis", "\xEDreis", "ireis",
        "\xE1veis", "\xEDamos", "armos", "ermos", "irmos",
        "aria", "eria", "iria", "asse", "esse", "isse",
        "aste", "este", "iste", "arei", "erei", "irei",
        "aram", "eram", "iram", "avam", "arem", "erem", "irem",
        "ando", "endo", "indo", "adas", "idas",
        "ar\xE1s", "aras", "er\xE1s", "eras", "ir\xE1s", "avas",
        "ares", "eres", "ires", "\xEDeis", "ados", "idos",
        "\xE1mos", "amos", "emos", "imos", "iras",
        "ada", "ida", "ar\xE1", "ara", "er\xE1", "era", "ir\xE1",
        "ava", "iam", "ado", "ido", "ias", "ais", "eis", "ira",
        "ia", "ei", "am", "em", "ar", "er", "ir", "as", "es", "is",
        "eu", "iu", "ou",
    )
    __step4_suffixes = ("os", "a", "i", "o", "\xE1", "\xED", "\xF3")

    def stem(self, word):
        """
        Stem a Portuguese word and return the stemmed form.

        The word is lower-cased first; stop words are returned as they
        are after lower-casing.

        :param word: The word that is stemmed.
        :type word: str or unicode
        :return: The stemmed form.
        :rtype: unicode
        """
        word = word.lower()
        if word in self.stopwords:
            return word

        changed_in_step1 = False
        changed_in_step2 = False

        # Nasal vowels are spelled out as two characters while stemming,
        # and the diaeresis after q/g is dropped.
        for old, new in (
            ("\xE3", "a~"),
            ("\xF5", "o~"),
            ("q\xFC", "qu"),
            ("g\xFC", "gu"),
        ):
            word = word.replace(old, new)

        r1, r2 = self._r1r2_standard(word, self.__vowels)
        rv = self._rv_standard(word, self.__vowels)

        # STEP 1: Standard suffix removal
        for ending in self.__step1_suffixes:
            if not word.endswith(ending):
                continue

            size = len(ending)

            if ending == "amente" and r1.endswith(ending):
                changed_in_step1 = True
                word, r2, rv = word[:-6], r2[:-6], rv[:-6]

                if r2.endswith("iv"):
                    word, r2, rv = word[:-2], r2[:-2], rv[:-2]
                    if r2.endswith("at"):
                        word, rv = word[:-2], rv[:-2]
                elif r2.endswith(("os", "ic", "ad")):
                    word, rv = word[:-2], rv[:-2]

            elif (
                ending in ("ira", "iras")
                and rv.endswith(ending)
                and word[:-size].endswith("e")
            ):
                # "eira(s)" in RV keeps its stem vowel: ... -> "eir".
                changed_in_step1 = True
                word = suffix_replace(word, ending, "ir")
                rv = suffix_replace(rv, ending, "ir")

            elif r2.endswith(ending):
                changed_in_step1 = True

                if ending in ("logia", "logias"):
                    word = suffix_replace(word, ending, "log")
                    rv = suffix_replace(rv, ending, "log")

                elif ending in ("uça~o", "uço~es"):
                    word = suffix_replace(word, ending, "u")
                    rv = suffix_replace(rv, ending, "u")

                elif ending in ("\xEAncia", "\xEAncias"):
                    word = suffix_replace(word, ending, "ente")
                    rv = suffix_replace(rv, ending, "ente")

                elif ending == "mente":
                    word, r2, rv = word[:-5], r2[:-5], rv[:-5]
                    if r2.endswith(("ante", "avel", "ivel")):
                        word, rv = word[:-4], rv[:-4]

                elif ending in ("idade", "idades"):
                    word, r2, rv = word[:-size], r2[:-size], rv[:-size]
                    if r2.endswith(("ic", "iv")):
                        word, rv = word[:-2], rv[:-2]
                    elif r2.endswith("abil"):
                        word, rv = word[:-4], rv[:-4]

                elif ending in ("iva", "ivo", "ivas", "ivos"):
                    word, r2, rv = word[:-size], r2[:-size], rv[:-size]
                    if r2.endswith("at"):
                        word, rv = word[:-2], rv[:-2]

                else:
                    word, rv = word[:-size], rv[:-size]

            # Only the first suffix the word ends with is examined.
            break

        # STEP 2: Verb suffixes
        if not changed_in_step1:
            for ending in self.__step2_suffixes:
                if rv.endswith(ending):
                    changed_in_step2 = True
                    word = word[: -len(ending)]
                    rv = rv[: -len(ending)]
                    break

        if changed_in_step1 or changed_in_step2:
            # STEP 3
            if rv.endswith("i") and word[-2] == "c":
                word = word[:-1]
                rv = rv[:-1]
        else:
            # STEP 4: Residual suffix
            for ending in self.__step4_suffixes:
                if rv.endswith(ending):
                    word = word[: -len(ending)]
                    rv = rv[: -len(ending)]
                    break

        # STEP 5
        if rv.endswith(("e", "\xE9", "\xEA")):
            word = word[:-1]
            rv = rv[:-1]

            drops_u = word.endswith("gu") and rv.endswith("u")
            if drops_u or (word.endswith("ci") and rv.endswith("i")):
                word = word[:-1]

        elif word.endswith("\xE7"):
            word = suffix_replace(word, "\xE7", "c")

        # Fold the two-character nasal vowels back into single letters.
        return word.replace("a~", "\xE3").replace("o~", "\xF5")
class RomanianStemmer(_StandardStemmer):
    """
    The Romanian Snowball stemmer.

    :cvar __vowels: The Romanian vowels.
    :type __vowels: unicode
    :cvar __step0_suffixes: Suffixes to be deleted in step 0 of the algorithm.
    :type __step0_suffixes: tuple
    :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm.
    :type __step1_suffixes: tuple
    :cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm.
    :type __step2_suffixes: tuple
    :cvar __step3_suffixes: Suffixes to be deleted in step 3 of the algorithm.
    :type __step3_suffixes: tuple
    :note: A detailed description of the Romanian
           stemming algorithm can be found under
           http://snowball.tartarus.org/algorithms/romanian/stemmer.html
    """

    __vowels = "aeiou\u0103\xE2\xEE"

    # The tables are scanned in order and scanning stops at the first
    # suffix the word ends with, so their ordering must be preserved.
    __step0_suffixes = (
        "iilor",
        "ului", "elor", "iile", "ilor", "atei", "a\u0163ie", "a\u0163ia",
        "aua", "ele", "iua", "iei", "ile",
        "ul", "ea", "ii",
    )
    __step1_suffixes = (
        "abilitate", "abilitati", "abilit\u0103\u0163i", "ibilitate",
        "abilit\u0103i",
        "ivitate", "ivitati", "ivit\u0103\u0163i",
        "icitate", "icitati", "icit\u0103\u0163i", "icatori",
        "ivit\u0103i", "icit\u0103i", "icator",
        "a\u0163iune", "atoare", "\u0103toare", "i\u0163iune", "itoare",
        "iciva", "icive", "icivi", "iciv\u0103",
        "icala", "icale", "icali", "ical\u0103",
        "ativa", "ative", "ativi", "ativ\u0103", "atori", "\u0103tori",
        "itiva", "itive", "itivi", "itiv\u0103", "itori",
        "iciv", "ical", "ativ", "ator", "\u0103tor", "itiv", "itor",
    )
    __step2_suffixes = (
        "abila", "abile", "abili", "abil\u0103",
        "ibila", "ibile", "ibili", "ibil\u0103",
        "atori", "itate", "itati", "it\u0103\u0163i",
        "abil", "ibil", "oasa", "oas\u0103", "oase",
        "anta", "ante", "anti", "ant\u0103", "ator", "it\u0103i",
        "iune", "iuni", "isme", "ista", "iste", "isti", "ist\u0103",
        "i\u015Fti",
        "ata", "at\u0103", "ati", "ate",
        "uta", "ut\u0103", "uti", "ute",
        "ita", "it\u0103", "iti", "ite",
        "ica", "ice", "ici", "ic\u0103",
        "osi", "o\u015Fi", "ant",
        "iva", "ive", "ivi", "iv\u0103",
        "ism", "ist",
        "at", "ut", "it", "ic", "os", "iv",
    )
    __step3_suffixes = (
        "seser\u0103\u0163i", "aser\u0103\u0163i", "iser\u0103\u0163i",
        "\xE2ser\u0103\u0163i", "user\u0103\u0163i",
        "seser\u0103m", "aser\u0103m", "iser\u0103m", "\xE2ser\u0103m",
        "user\u0103m", "ser\u0103\u0163i", "sese\u015Fi", "seser\u0103",
        "easc\u0103", "ar\u0103\u0163i", "ur\u0103\u0163i", "ir\u0103\u0163i",
        "\xE2r\u0103\u0163i",
        "ase\u015Fi", "aser\u0103", "ise\u015Fi", "iser\u0103",
        "\xe2se\u015Fi", "\xE2ser\u0103", "use\u015Fi", "user\u0103",
        "ser\u0103m", "sesem", "indu", "\xE2ndu", "eaz\u0103",
        "e\u015Fti", "e\u015Fte", "\u0103\u015Fti", "\u0103\u015Fte",
        "ea\u0163i", "ia\u0163i",
        "ar\u0103m", "ur\u0103m", "ir\u0103m", "\xE2r\u0103m",
        "asem", "isem", "\xE2sem", "usem",
        "se\u015Fi", "ser\u0103", "sese",
        "are", "ere", "ire", "\xE2re", "ind", "\xE2nd",
        "eze", "ezi", "esc", "\u0103sc",
        "eam", "eai", "eau", "iam", "iai", "iau",
        "a\u015Fi", "ar\u0103", "u\u015Fi", "ur\u0103",
        "i\u015Fi", "ir\u0103", "\xE2\u015Fi", "\xe2r\u0103",
        "ase", "ise", "\xE2se", "use",
        "a\u0163i", "e\u0163i", "i\u0163i", "\xe2\u0163i",
        "sei",
        "ez", "am", "ai", "au", "ea", "ia", "ui", "\xE2i",
        "\u0103m", "em", "im", "\xE2m", "se",
    )

    def stem(self, word):
        """
        Stem a Romanian word and return the stemmed form.

        The word is lower-cased first; stop words are returned as they
        are after lower-casing.

        :param word: The word that is stemmed.
        :type word: str or unicode
        :return: The stemmed form.
        :rtype: unicode
        """
        word = word.lower()
        if word in self.stopwords:
            return word

        step1_success = False
        step2_success = False

        # 'u' and 'i' between two vowels are put into upper case, which
        # takes them out of the vowel set for the steps below.
        for pos in range(1, len(word) - 1):
            if word[pos - 1] in self.__vowels and word[pos + 1] in self.__vowels:
                if word[pos] == "u":
                    word = word[:pos] + "U" + word[pos + 1 :]
                elif word[pos] == "i":
                    word = word[:pos] + "I" + word[pos + 1 :]

        r1, r2 = self._r1r2_standard(word, self.__vowels)
        rv = self._rv_standard(word, self.__vowels)

        # NOTE: below, region membership is tested with a substring check
        # ("ending in r1") and r1/r2 are mostly not refreshed after the
        # word is shortened. This mirrors the existing behaviour exactly.

        # STEP 0: Removal of plurals and other simplifications
        for ending in self.__step0_suffixes:
            if not word.endswith(ending):
                continue

            if ending in r1:
                if ending in ("ul", "ului"):
                    word = word[: -len(ending)]
                    rv = rv[: -len(ending)] if ending in rv else ""

                elif ending in ("aua", "atei") or (
                    ending == "ile" and word[-5:-3] != "ab"
                ):
                    word = word[:-2]

                elif ending in ("ea", "ele", "elor"):
                    word = suffix_replace(word, ending, "e")
                    rv = suffix_replace(rv, ending, "e") if ending in rv else ""

                elif ending in ("ii", "iua", "iei", "iile", "iilor", "ilor"):
                    word = suffix_replace(word, ending, "i")
                    rv = suffix_replace(rv, ending, "i") if ending in rv else ""

                elif ending in ("a\u0163ie", "a\u0163ia"):
                    word = word[:-1]
            break

        # STEP 1: Reduction of combining suffixes
        to_abil = ("abilitate", "abilitati", "abilit\u0103i", "abilit\u0103\u0163i")
        to_iv = ("ivitate", "ivitati", "ivit\u0103i", "ivit\u0103\u0163i")
        to_ic = (
            "icitate", "icitati", "icit\u0103i", "icit\u0103\u0163i",
            "icator", "icatori",
            "iciv", "iciva", "icive", "icivi", "iciv\u0103",
            "ical", "icala", "icale", "icali", "ical\u0103",
        )
        to_at = (
            "ativ", "ativa", "ative", "ativi", "ativ\u0103",
            "a\u0163iune", "atoare", "ator", "atori",
            "\u0103toare", "\u0103tor", "\u0103tori",
        )
        to_it = (
            "itiv", "itiva", "itive", "itivi", "itiv\u0103",
            "i\u0163iune", "itoare", "itor", "itori",
        )

        # Repeat until a pass over the table replaces nothing.
        while True:
            replaced = False

            for ending in self.__step1_suffixes:
                if not word.endswith(ending):
                    continue

                if ending in r1:
                    step1_success = True
                    replaced = True

                    if ending in to_abil:
                        word = suffix_replace(word, ending, "abil")
                    elif ending == "ibilitate":
                        word = word[:-5]
                    elif ending in to_iv:
                        word = suffix_replace(word, ending, "iv")
                    elif ending in to_ic:
                        word = suffix_replace(word, ending, "ic")
                    elif ending in to_at:
                        word = suffix_replace(word, ending, "at")
                        if ending in r2:
                            r2 = suffix_replace(r2, ending, "at")
                    elif ending in to_it:
                        word = suffix_replace(word, ending, "it")
                        if ending in r2:
                            r2 = suffix_replace(r2, ending, "it")
                else:
                    step1_success = False
                break

            if not replaced:
                break

        # STEP 2: Removal of standard suffixes
        to_ist = (
            "ism", "isme", "ist", "ista", "iste", "isti", "ist\u0103",
            "i\u015Fti",
        )
        for ending in self.__step2_suffixes:
            if not word.endswith(ending):
                continue

            if ending in r2:
                step2_success = True

                if ending in ("iune", "iuni"):
                    # Only rewritten when preceded by t-comma: ...tiune -> ...t
                    if word[-5] == "\u0163":
                        word = word[:-5] + "t"
                elif ending in to_ist:
                    word = suffix_replace(word, ending, "ist")
                else:
                    word = word[: -len(ending)]
            break

        # STEP 3: Removal of verb suffixes
        if not step1_success and not step2_success:
            always_removed = (
                "seser\u0103\u0163i", "seser\u0103m", "ser\u0103\u0163i",
                "sese\u015Fi", "seser\u0103", "ser\u0103m", "sesem",
                "se\u015Fi", "ser\u0103", "sese",
                "a\u0163i", "e\u0163i", "i\u0163i", "\xE2\u0163i",
                "sei", "\u0103m", "em", "im", "\xE2m", "se",
            )
            for ending in self.__step3_suffixes:
                if not word.endswith(ending):
                    continue

                if ending in rv:
                    if ending in always_removed:
                        word = word[: -len(ending)]
                        rv = rv[: -len(ending)]
                    elif (
                        not rv.startswith(ending)
                        and rv[rv.index(ending) - 1] not in "aeio\u0103\xE2\xEE"
                    ):
                        # Remaining suffixes go only when the letter before
                        # their first occurrence in RV is not in that set.
                        word = word[: -len(ending)]
                break

        # STEP 4: Removal of final vowel
        for ending in ("ie", "a", "e", "i", "\u0103"):
            if word.endswith(ending):
                if ending in rv:
                    word = word[: -len(ending)]
                break

        # Undo the upper-case markers.
        return word.replace("I", "i").replace("U", "u")
- class RussianStemmer(_LanguageSpecificStemmer):
- """
- The Russian Snowball stemmer.
- :cvar __perfective_gerund_suffixes: Suffixes to be deleted.
- :type __perfective_gerund_suffixes: tuple
- :cvar __adjectival_suffixes: Suffixes to be deleted.
- :type __adjectival_suffixes: tuple
- :cvar __reflexive_suffixes: Suffixes to be deleted.
- :type __reflexive_suffixes: tuple
- :cvar __verb_suffixes: Suffixes to be deleted.
- :type __verb_suffixes: tuple
- :cvar __noun_suffixes: Suffixes to be deleted.
- :type __noun_suffixes: tuple
- :cvar __superlative_suffixes: Suffixes to be deleted.
- :type __superlative_suffixes: tuple
- :cvar __derivational_suffixes: Suffixes to be deleted.
- :type __derivational_suffixes: tuple
- :note: A detailed description of the Russian
- stemming algorithm can be found under
- http://snowball.tartarus.org/algorithms/russian/stemmer.html
- """
- __perfective_gerund_suffixes = (
- "ivshis'",
- "yvshis'",
- "vshis'",
- "ivshi",
- "yvshi",
- "vshi",
- "iv",
- "yv",
- "v",
- )
- __adjectival_suffixes = (
- "ui^ushchi^ui^u",
- "ui^ushchi^ai^a",
- "ui^ushchimi",
- "ui^ushchymi",
- "ui^ushchego",
- "ui^ushchogo",
- "ui^ushchemu",
- "ui^ushchomu",
- "ui^ushchikh",
- "ui^ushchykh",
- "ui^ushchui^u",
- "ui^ushchaia",
- "ui^ushchoi^u",
- "ui^ushchei^u",
- "i^ushchi^ui^u",
- "i^ushchi^ai^a",
- "ui^ushchee",
- "ui^ushchie",
- "ui^ushchye",
- "ui^ushchoe",
- "ui^ushchei`",
- "ui^ushchii`",
- "ui^ushchyi`",
- "ui^ushchoi`",
- "ui^ushchem",
- "ui^ushchim",
- "ui^ushchym",
- "ui^ushchom",
- "i^ushchimi",
- "i^ushchymi",
- "i^ushchego",
- "i^ushchogo",
- "i^ushchemu",
- "i^ushchomu",
- "i^ushchikh",
- "i^ushchykh",
- "i^ushchui^u",
- "i^ushchai^a",
- "i^ushchoi^u",
- "i^ushchei^u",
- "i^ushchee",
- "i^ushchie",
- "i^ushchye",
- "i^ushchoe",
- "i^ushchei`",
- "i^ushchii`",
- "i^ushchyi`",
- "i^ushchoi`",
- "i^ushchem",
- "i^ushchim",
- "i^ushchym",
- "i^ushchom",
- "shchi^ui^u",
- "shchi^ai^a",
- "ivshi^ui^u",
- "ivshi^ai^a",
- "yvshi^ui^u",
- "yvshi^ai^a",
- "shchimi",
- "shchymi",
- "shchego",
- "shchogo",
- "shchemu",
- "shchomu",
- "shchikh",
- "shchykh",
- "shchui^u",
- "shchai^a",
- "shchoi^u",
- "shchei^u",
- "ivshimi",
- "ivshymi",
- "ivshego",
- "ivshogo",
- "ivshemu",
- "ivshomu",
- "ivshikh",
- "ivshykh",
- "ivshui^u",
- "ivshai^a",
- "ivshoi^u",
- "ivshei^u",
- "yvshimi",
- "yvshymi",
- "yvshego",
- "yvshogo",
- "yvshemu",
- "yvshomu",
- "yvshikh",
- "yvshykh",
- "yvshui^u",
- "yvshai^a",
- "yvshoi^u",
- "yvshei^u",
- "vshi^ui^u",
- "vshi^ai^a",
- "shchee",
- "shchie",
- "shchye",
- "shchoe",
- "shchei`",
- "shchii`",
- "shchyi`",
- "shchoi`",
- "shchem",
- "shchim",
- "shchym",
- "shchom",
- "ivshee",
- "ivshie",
- "ivshye",
- "ivshoe",
- "ivshei`",
- "ivshii`",
- "ivshyi`",
- "ivshoi`",
- "ivshem",
- "ivshim",
- "ivshym",
- "ivshom",
- "yvshee",
- "yvshie",
- "yvshye",
- "yvshoe",
- "yvshei`",
- "yvshii`",
- "yvshyi`",
- "yvshoi`",
- "yvshem",
- "yvshim",
- "yvshym",
- "yvshom",
- "vshimi",
- "vshymi",
- "vshego",
- "vshogo",
- "vshemu",
- "vshomu",
- "vshikh",
- "vshykh",
- "vshui^u",
- "vshai^a",
- "vshoi^u",
- "vshei^u",
- "emi^ui^u",
- "emi^ai^a",
- "nni^ui^u",
- "nni^ai^a",
- "vshee",
- "vshie",
- "vshye",
- "vshoe",
- "vshei`",
- "vshii`",
- "vshyi`",
- "vshoi`",
- "vshem",
- "vshim",
- "vshym",
- "vshom",
- "emimi",
- "emymi",
- "emego",
- "emogo",
- "ememu",
- "emomu",
- "emikh",
- "emykh",
- "emui^u",
- "emai^a",
- "emoi^u",
- "emei^u",
- "nnimi",
- "nnymi",
- "nnego",
- "nnogo",
- "nnemu",
- "nnomu",
- "nnikh",
- "nnykh",
- "nnui^u",
- "nnai^a",
- "nnoi^u",
- "nnei^u",
- "emee",
- "emie",
- "emye",
- "emoe",
- "emei`",
- "emii`",
- "emyi`",
- "emoi`",
- "emem",
- "emim",
- "emym",
- "emom",
- "nnee",
- "nnie",
- "nnye",
- "nnoe",
- "nnei`",
- "nnii`",
- "nnyi`",
- "nnoi`",
- "nnem",
- "nnim",
- "nnym",
- "nnom",
- "i^ui^u",
- "i^ai^a",
- "imi",
- "ymi",
- "ego",
- "ogo",
- "emu",
- "omu",
- "ikh",
- "ykh",
- "ui^u",
- "ai^a",
- "oi^u",
- "ei^u",
- "ee",
- "ie",
- "ye",
- "oe",
- "ei`",
- "ii`",
- "yi`",
- "oi`",
- "em",
- "im",
- "ym",
- "om",
- )
- __reflexive_suffixes = ("si^a", "s'")
- __verb_suffixes = (
- "esh'",
- "ei`te",
- "ui`te",
- "ui^ut",
- "ish'",
- "ete",
- "i`te",
- "i^ut",
- "nno",
- "ila",
- "yla",
- "ena",
- "ite",
- "ili",
- "yli",
- "ilo",
- "ylo",
- "eno",
- "i^at",
- "uet",
- "eny",
- "it'",
- "yt'",
- "ui^u",
- "la",
- "na",
- "li",
- "em",
- "lo",
- "no",
- "et",
- "ny",
- "t'",
- "ei`",
- "ui`",
- "il",
- "yl",
- "im",
- "ym",
- "en",
- "it",
- "yt",
- "i^u",
- "i`",
- "l",
- "n",
- )
- __noun_suffixes = (
- "ii^ami",
- "ii^akh",
- "i^ami",
- "ii^am",
- "i^akh",
- "ami",
- "iei`",
- "i^am",
- "iem",
- "akh",
- "ii^u",
- "'i^u",
- "ii^a",
- "'i^a",
- "ev",
- "ov",
- "ie",
- "'e",
- "ei",
- "ii",
- "ei`",
- "oi`",
- "ii`",
- "em",
- "am",
- "om",
- "i^u",
- "i^a",
- "a",
- "e",
- "i",
- "i`",
- "o",
- "u",
- "y",
- "'",
- )
- __superlative_suffixes = ("ei`she", "ei`sh")
- __derivational_suffixes = ("ost'", "ost")
def stem(self, word):
    """
    Stem a Russian word and return the stemmed form.

    Stop words, and words in which no character has a code point
    above 255, are returned unchanged. Any other word is passed
    through __cyrillic_to_roman, stripped of suffixes, and passed
    through __roman_to_cyrillic before it is returned.

    :param word: The word that is stemmed.
    :type word: str or unicode
    :return: The stemmed form.
    :rtype: unicode
    """
    if word in self.stopwords:
        return word

    if not any(ord(letter) > 255 for letter in word):
        return word

    word = self.__cyrillic_to_roman(word)

    gerund_removed = False
    adjectival_removed = False
    verb_removed = False

    rv, r2 = self.__regions_russian(word)

    # Adjectival endings that are only removed after "a" or "i^a".
    guarded_adjectival = (
        "i^ushchi^ui^u", "i^ushchi^ai^a",
        "i^ushchui^u", "i^ushchai^a", "i^ushchoi^u", "i^ushchei^u",
        "i^ushchimi", "i^ushchymi", "i^ushchego", "i^ushchogo",
        "i^ushchemu", "i^ushchomu", "i^ushchikh", "i^ushchykh",
        "shchi^ui^u", "shchi^ai^a",
        "i^ushchee", "i^ushchie", "i^ushchye", "i^ushchoe",
        "i^ushchei`", "i^ushchii`", "i^ushchyi`", "i^ushchoi`",
        "i^ushchem", "i^ushchim", "i^ushchym", "i^ushchom",
        "vshi^ui^u", "vshi^ai^a",
        "shchui^u", "shchai^a", "shchoi^u", "shchei^u",
        "emi^ui^u", "emi^ai^a", "nni^ui^u", "nni^ai^a",
        "shchimi", "shchymi", "shchego", "shchogo",
        "shchemu", "shchomu", "shchikh", "shchykh",
        "vshui^u", "vshai^a", "vshoi^u", "vshei^u",
        "shchee", "shchie", "shchye", "shchoe",
        "shchei`", "shchii`", "shchyi`", "shchoi`",
        "shchem", "shchim", "shchym", "shchom",
        "vshimi", "vshymi", "vshego", "vshogo",
        "vshemu", "vshomu", "vshikh", "vshykh",
        "emui^u", "emai^a", "emoi^u", "emei^u",
        "nnui^u", "nnai^a", "nnoi^u", "nnei^u",
        "vshee", "vshie", "vshye", "vshoe",
        "vshei`", "vshii`", "vshyi`", "vshoi`",
        "vshem", "vshim", "vshym", "vshom",
        "emimi", "emymi", "emego", "emogo",
        "ememu", "emomu", "emikh", "emykh",
        "nnimi", "nnymi", "nnego", "nnogo",
        "nnemu", "nnomu", "nnikh", "nnykh",
        "emee", "emie", "emye", "emoe",
        "emei`", "emii`", "emyi`", "emoi`",
        "emem", "emim", "emym", "emom",
        "nnee", "nnie", "nnye", "nnoe",
        "nnei`", "nnii`", "nnyi`", "nnoi`",
        "nnem", "nnim", "nnym", "nnom",
    )
    # Verb endings that are only removed after "a" or "i^a".
    guarded_verb = (
        "la", "na", "ete", "i`te", "li", "i`", "l", "em", "n",
        "lo", "no", "et", "i^ut", "ny", "t'", "esh'", "nno",
    )

    # Step 1
    for ending in self.__perfective_gerund_suffixes:
        if not rv.endswith(ending):
            continue
        size = len(ending)
        # A guarded ending that lacks the required preceding "a"/"i^a"
        # is skipped and the search continues with shorter endings.
        if ending in ("v", "vshi", "vshis'") and not rv[:-size].endswith(
            ("i^a", "a")
        ):
            continue
        word, r2, rv = word[:-size], r2[:-size], rv[:-size]
        gerund_removed = True
        break

    if not gerund_removed:
        for ending in self.__reflexive_suffixes:
            if rv.endswith(ending):
                size = len(ending)
                word, r2, rv = word[:-size], r2[:-size], rv[:-size]
                break

        for ending in self.__adjectival_suffixes:
            if not rv.endswith(ending):
                continue
            size = len(ending)
            if ending in guarded_adjectival and not rv[:-size].endswith(
                ("i^a", "a")
            ):
                continue
            word, r2, rv = word[:-size], r2[:-size], rv[:-size]
            adjectival_removed = True
            break

        if not adjectival_removed:
            for ending in self.__verb_suffixes:
                if not rv.endswith(ending):
                    continue
                size = len(ending)
                if ending in guarded_verb and not rv[:-size].endswith(
                    ("i^a", "a")
                ):
                    continue
                word, r2, rv = word[:-size], r2[:-size], rv[:-size]
                verb_removed = True
                break

        # Noun endings are tried only when neither an adjectival nor a
        # verb ending was removed.
        if not adjectival_removed and not verb_removed:
            for ending in self.__noun_suffixes:
                if rv.endswith(ending):
                    size = len(ending)
                    word, r2, rv = word[:-size], r2[:-size], rv[:-size]
                    break

    # Step 2
    if rv.endswith("i"):
        word = word[:-1]
        r2 = r2[:-1]

    # Step 3
    for ending in self.__derivational_suffixes:
        if r2.endswith(ending):
            word = word[: -len(ending)]
            break

    # Step 4
    if word.endswith("nn"):
        # Undouble "nn"; nothing else in this step applies afterwards.
        word = word[:-1]
    else:
        superlative_removed = False
        for ending in self.__superlative_suffixes:
            if word.endswith(ending):
                word = word[: -len(ending)]
                superlative_removed = True
                break

        if word.endswith("nn"):
            word = word[:-1]

        if not superlative_removed and word.endswith("'"):
            word = word[:-1]

    return self.__roman_to_cyrillic(word)
def __regions_russian(self, word):
    """
    Return the regions RV and R2 which are used by the Russian stemmer.

    In any word, RV is the region after the first vowel,
    or the end of the word if it contains no vowel.

    R2 is the region after the first non-vowel following
    a vowel in R1, or the end of the word if there is no such non-vowel.

    R1 is the region after the first non-vowel following a vowel,
    or the end of the word if there is no such non-vowel.

    :param word: The Russian word whose regions RV and R2 are determined.
    :type word: str or unicode
    :return: the regions RV and R2 for the respective Russian word.
    :rtype: tuple
    :note: This helper method is invoked by the stem method of the subclass
           RussianStemmer. It is not to be invoked directly!
    """
    vowels = ("A", "U", "E", "a", "e", "i", "o", "u", "y")

    # Collapse the multi-character vowels into single placeholder
    # letters so that every vowel occupies exactly one position.
    word = word.replace("i^a", "A").replace("i^u", "U").replace("e`", "E")

    def after_vowel_then_consonant(text):
        # Region following the first non-vowel that comes right after
        # a vowel; empty when there is no such pair.
        for pos in range(1, len(text)):
            if text[pos - 1] in vowels and text[pos] not in vowels:
                return text[pos + 1 :]
        return ""

    r1 = after_vowel_then_consonant(word)
    r2 = after_vowel_then_consonant(r1)

    rv = ""
    for pos, letter in enumerate(word):
        if letter in vowels:
            rv = word[pos + 1 :]
            break

    # Expand the placeholders again before handing the regions back.
    r2 = r2.replace("A", "i^a").replace("U", "i^u").replace("E", "e`")
    rv = rv.replace("A", "i^a").replace("U", "i^u").replace("E", "e`")

    return (rv, r2)
- def __cyrillic_to_roman(self, word):
- """
- Transliterate a Russian word into the Roman alphabet.
- A Russian word whose letters consist of the Cyrillic
- alphabet are transliterated into the Roman alphabet
- in order to ease the forthcoming stemming process.
- :param word: The word that is transliterated.
- :type word: unicode
- :return: the transliterated word.
- :rtype: unicode
- :note: This helper method is invoked by the stem method of the subclass
- RussianStemmer. It is not to be invoked directly!
- """
- word = (
- word.replace("\u0410", "a")
- .replace("\u0430", "a")
- .replace("\u0411", "b")
- .replace("\u0431", "b")
- .replace("\u0412", "v")
- .replace("\u0432", "v")
- .replace("\u0413", "g")
- .replace("\u0433", "g")
- .replace("\u0414", "d")
- .replace("\u0434", "d")
- .replace("\u0415", "e")
- .replace("\u0435", "e")
- .replace("\u0401", "e")
- .replace("\u0451", "e")
- .replace("\u0416", "zh")
- .replace("\u0436", "zh")
- .replace("\u0417", "z")
- .replace("\u0437", "z")
- .replace("\u0418", "i")
- .replace("\u0438", "i")
- .replace("\u0419", "i`")
- .replace("\u0439", "i`")
- .replace("\u041A", "k")
- .replace("\u043A", "k")
- .replace("\u041B", "l")
- .replace("\u043B", "l")
- .replace("\u041C", "m")
- .replace("\u043C", "m")
- .replace("\u041D", "n")
- .replace("\u043D", "n")
- .replace("\u041E", "o")
- .replace("\u043E", "o")
- .replace("\u041F", "p")
- .replace("\u043F", "p")
- .replace("\u0420", "r")
- .replace("\u0440", "r")
- .replace("\u0421", "s")
- .replace("\u0441", "s")
- .replace("\u0422", "t")
- .replace("\u0442", "t")
- .replace("\u0423", "u")
- .replace("\u0443", "u")
- .replace("\u0424", "f")
- .replace("\u0444", "f")
- .replace("\u0425", "kh")
- .replace("\u0445", "kh")
- .replace("\u0426", "t^s")
- .replace("\u0446", "t^s")
- .replace("\u0427", "ch")
- .replace("\u0447", "ch")
- .replace("\u0428", "sh")
- .replace("\u0448", "sh")
- .replace("\u0429", "shch")
- .replace("\u0449", "shch")
- .replace("\u042A", "''")
- .replace("\u044A", "''")
- .replace("\u042B", "y")
- .replace("\u044B", "y")
- .replace("\u042C", "'")
- .replace("\u044C", "'")
- .replace("\u042D", "e`")
- .replace("\u044D", "e`")
- .replace("\u042E", "i^u")
- .replace("\u044E", "i^u")
- .replace("\u042F", "i^a")
- .replace("\u044F", "i^a")
- )
- return word
- def __roman_to_cyrillic(self, word):
- """
- Transliterate a Russian word back into the Cyrillic alphabet.
- A Russian word formerly transliterated into the Roman alphabet
- in order to ease the stemming process, is transliterated back
- into the Cyrillic alphabet, its original form.
- :param word: The word that is transliterated.
- :type word: str or unicode
- :return: word, the transliterated word.
- :rtype: unicode
- :note: This helper method is invoked by the stem method of the subclass
- RussianStemmer. It is not to be invoked directly!
- """
- word = (
- word.replace("i^u", "\u044E")
- .replace("i^a", "\u044F")
- .replace("shch", "\u0449")
- .replace("kh", "\u0445")
- .replace("t^s", "\u0446")
- .replace("ch", "\u0447")
- .replace("e`", "\u044D")
- .replace("i`", "\u0439")
- .replace("sh", "\u0448")
- .replace("k", "\u043A")
- .replace("e", "\u0435")
- .replace("zh", "\u0436")
- .replace("a", "\u0430")
- .replace("b", "\u0431")
- .replace("v", "\u0432")
- .replace("g", "\u0433")
- .replace("d", "\u0434")
- .replace("e", "\u0435")
- .replace("z", "\u0437")
- .replace("i", "\u0438")
- .replace("l", "\u043B")
- .replace("m", "\u043C")
- .replace("n", "\u043D")
- .replace("o", "\u043E")
- .replace("p", "\u043F")
- .replace("r", "\u0440")
- .replace("s", "\u0441")
- .replace("t", "\u0442")
- .replace("u", "\u0443")
- .replace("f", "\u0444")
- .replace("''", "\u044A")
- .replace("y", "\u044B")
- .replace("'", "\u044C")
- )
- return word
class SpanishStemmer(_StandardStemmer):
    """
    The Spanish Snowball stemmer.

    :cvar __vowels: The Spanish vowels.
    :type __vowels: unicode
    :cvar __step0_suffixes: Suffixes to be deleted in step 0 of the algorithm.
    :type __step0_suffixes: tuple
    :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm.
    :type __step1_suffixes: tuple
    :cvar __step2a_suffixes: Suffixes to be deleted in step 2a of the algorithm.
    :type __step2a_suffixes: tuple
    :cvar __step2b_suffixes: Suffixes to be deleted in step 2b of the algorithm.
    :type __step2b_suffixes: tuple
    :cvar __step3_suffixes: Suffixes to be deleted in step 3 of the algorithm.
    :type __step3_suffixes: tuple
    :note: A detailed description of the Spanish
           stemming algorithm can be found under
           http://snowball.tartarus.org/algorithms/spanish/stemmer.html
    """

    __vowels = "aeiou\xE1\xE9\xED\xF3\xFA\xFC"

    # Every suffix table lists longer suffixes before shorter ones; the
    # loops in stem() stop at the first entry that matches.
    __step0_suffixes = (
        "selas", "selos",
        "sela", "selo",
        "las", "les", "los", "nos",
        "me", "se", "la", "le", "lo",
    )
    __step1_suffixes = (
        "amientos", "imientos",
        "amiento", "imiento", "aciones", "uciones",
        "adoras", "adores", "ancias", "log\xEDas", "encias", "amente",
        "idades",
        "anzas", "ismos", "ables", "ibles", "istas", "adora", "aci\xF3n",
        "antes", "ancia", "log\xEDa", "uci\xf3n", "encia", "mente",
        "anza", "icos", "icas", "ismo", "able", "ible", "ista", "osos",
        "osas", "ador", "ante", "idad", "ivas", "ivos",
        "ico", "ica", "oso", "osa", "iva", "ivo",
    )
    __step2a_suffixes = (
        "yeron", "yendo", "yamos",
        "yais",
        "yan", "yen", "yas", "yes",
        "ya", "ye", "yo", "y\xF3",
    )
    __step2b_suffixes = (
        "ar\xEDamos", "er\xEDamos", "ir\xEDamos", "i\xE9ramos", "i\xE9semos",
        "ar\xEDais", "aremos", "er\xEDais", "eremos", "ir\xEDais", "iremos",
        "ierais", "ieseis", "asteis", "isteis", "\xE1bamos", "\xE1ramos",
        "\xE1semos",
        "ar\xEDan", "ar\xEDas", "ar\xE9is", "er\xEDan", "er\xEDas", "er\xE9is",
        "ir\xEDan", "ir\xEDas", "ir\xE9is", "ieran", "iesen", "ieron",
        "iendo", "ieras", "ieses", "abais", "arais", "aseis", "\xE9amos",
        "ar\xE1n", "ar\xE1s", "ar\xEDa", "er\xE1n", "er\xE1s", "er\xEDa",
        "ir\xE1n", "ir\xE1s", "ir\xEDa", "iera", "iese", "aste", "iste",
        "aban", "aran", "asen", "aron", "ando", "abas", "adas", "idas",
        "aras", "ases", "\xEDais", "ados", "idos", "amos", "imos", "emos",
        "ar\xE1", "ar\xE9", "er\xE1", "er\xE9", "ir\xE1", "ir\xE9",
        "aba", "ada", "ida", "ara", "ase", "\xEDan", "ado", "ido",
        "\xEDas", "\xE1is", "\xE9is",
        "\xEDa", "ad", "ed", "id", "an", "i\xF3", "ar", "er", "ir", "as",
        "\xEDs", "en", "es",
    )
    __step3_suffixes = ("os", "a", "e", "o", "\xE1", "\xE9", "\xED", "\xF3")

    def stem(self, word):
        """
        Stem a Spanish word and return the stemmed form.

        The word is lower-cased first; a word found in ``self.stopwords``
        is returned as is. Otherwise the suffix-stripping steps 0 to 3
        are applied and acute accents are removed from the result.

        :param word: The word that is stemmed.
        :type word: str or unicode
        :return: The stemmed form.
        :rtype: unicode
        """
        word = word.lower()
        if word in self.stopwords:
            return word

        r1, r2 = self._r1r2_standard(word, self.__vowels)
        rv = self._rv_standard(word, self.__vowels)
        step1_applied = False

        # STEP 0: Attached pronoun
        for pronoun in self.__step0_suffixes:
            if not (word.endswith(pronoun) and rv.endswith(pronoun)):
                continue

            cut = len(pronoun)
            rv_head = rv[:-cut]
            after_verb_form = rv_head.endswith(
                (
                    "ando",
                    "\xE1ndo",
                    "ar",
                    "\xE1r",
                    "er",
                    "\xE9r",
                    "iendo",
                    "i\xE9ndo",
                    "ir",
                    "\xEDr",
                )
            )
            # "yendo" has to lie in RV, the "u" in front of it need not.
            after_uyendo = rv_head.endswith("yendo") and word[:-cut].endswith(
                "uyendo"
            )
            if after_verb_form or after_uyendo:
                word = self.__replace_accented(word[:-cut])
                r1 = self.__replace_accented(r1[:-cut])
                r2 = self.__replace_accented(r2[:-cut])
                rv = self.__replace_accented(rv[:-cut])
            # Only the first matching pronoun is examined, removed or not.
            break

        # STEP 1: Standard suffix removal
        for suffix in self.__step1_suffixes:
            if not word.endswith(suffix):
                continue

            cut = len(suffix)

            if suffix == "amente" and r1.endswith(suffix):
                step1_applied = True
                word, r2, rv = word[:-6], r2[:-6], rv[:-6]

                if r2.endswith("iv"):
                    word, r2, rv = word[:-2], r2[:-2], rv[:-2]
                    if r2.endswith("at"):
                        word, rv = word[:-2], rv[:-2]
                elif r2.endswith(("os", "ic", "ad")):
                    word, rv = word[:-2], rv[:-2]

            elif r2.endswith(suffix):
                step1_applied = True

                if suffix in (
                    "adora",
                    "ador",
                    "aci\xF3n",
                    "adoras",
                    "adores",
                    "aciones",
                    "ante",
                    "antes",
                    "ancia",
                    "ancias",
                ):
                    word, r2, rv = word[:-cut], r2[:-cut], rv[:-cut]
                    if r2.endswith("ic"):
                        word, rv = word[:-2], rv[:-2]

                elif suffix in ("log\xEDa", "log\xEDas"):
                    word = suffix_replace(word, suffix, "log")
                    rv = suffix_replace(rv, suffix, "log")

                elif suffix in ("uci\xF3n", "uciones"):
                    word = suffix_replace(word, suffix, "u")
                    rv = suffix_replace(rv, suffix, "u")

                elif suffix in ("encia", "encias"):
                    word = suffix_replace(word, suffix, "ente")
                    rv = suffix_replace(rv, suffix, "ente")

                elif suffix == "mente":
                    word, r2, rv = word[:-cut], r2[:-cut], rv[:-cut]
                    if r2.endswith(("ante", "able", "ible")):
                        word, rv = word[:-4], rv[:-4]

                elif suffix in ("idad", "idades"):
                    word, r2, rv = word[:-cut], r2[:-cut], rv[:-cut]
                    # r2 is not shortened inside the loop, so every
                    # candidate is tested against the same R2.
                    for pre_suff in ("abil", "ic", "iv"):
                        if r2.endswith(pre_suff):
                            word = word[: -len(pre_suff)]
                            rv = rv[: -len(pre_suff)]

                elif suffix in ("ivo", "iva", "ivos", "ivas"):
                    word, r2, rv = word[:-cut], r2[:-cut], rv[:-cut]
                    if r2.endswith("at"):
                        word, rv = word[:-2], rv[:-2]

                else:
                    word, rv = word[:-cut], rv[:-cut]

            # The first suffix the word ends with closes step 1, whether
            # or not it could be removed.
            break

        # Steps 2a and 2b are attempted only if step 1 removed nothing.
        if not step1_applied:
            # STEP 2a: Verb suffixes beginning 'y'
            for suffix in self.__step2a_suffixes:
                cut = len(suffix)
                # The "u" before the suffix is looked up in the whole word.
                if rv.endswith(suffix) and word[-cut - 1 : -cut] == "u":
                    word, rv = word[:-cut], rv[:-cut]
                    break

            # STEP 2b: Other verb suffixes
            for suffix in self.__step2b_suffixes:
                if not rv.endswith(suffix):
                    continue
                cut = len(suffix)
                word, rv = word[:-cut], rv[:-cut]
                if suffix in ("en", "es", "\xE9is", "emos"):
                    # Drop the "u" of a preceding "gu".
                    if word.endswith("gu"):
                        word = word[:-1]
                    if rv.endswith("gu"):
                        rv = rv[:-1]
                break

        # STEP 3: Residual suffix
        for suffix in self.__step3_suffixes:
            if not rv.endswith(suffix):
                continue
            cut = len(suffix)
            word = word[:-cut]
            if suffix in ("e", "\xE9"):
                rv = rv[:-cut]
                if word[-2:] == "gu" and rv.endswith("u"):
                    word = word[:-1]
            break

        return self.__replace_accented(word)

    def __replace_accented(self, word):
        """
        Replace the acute-accented vowels of a word with plain vowels.

        :param word: A Spanish word, with or without accents
        :type word: str or unicode
        :return: the word with each of the letters á, é, í, ó, ú replaced
                 by its non-accented counterpart (a, e, i, o, u)
        :rtype: str or unicode
        """
        plain_vowels = str.maketrans("\xE1\xE9\xED\xF3\xFA", "aeiou")
        return word.translate(plain_vowels)
class SwedishStemmer(_ScandinavianStemmer):
    """
    The Swedish Snowball stemmer.

    :cvar __vowels: The Swedish vowels.
    :type __vowels: unicode
    :cvar __s_ending: Letters that may directly appear before a word final 's'.
    :type __s_ending: unicode
    :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm.
    :type __step1_suffixes: tuple
    :cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm.
    :type __step2_suffixes: tuple
    :cvar __step3_suffixes: Suffixes to be deleted in step 3 of the algorithm.
    :type __step3_suffixes: tuple
    :note: A detailed description of the Swedish
           stemming algorithm can be found under
           http://snowball.tartarus.org/algorithms/swedish/stemmer.html
    """

    __vowels = "aeiouy\xE4\xE5\xF6"
    __s_ending = "bcdfghjklmnoprtvy"
    # Longer suffixes come first; stem() takes the first one that matches.
    __step1_suffixes = (
        "heterna",
        "hetens",
        "heter", "heten", "anden", "arnas", "ernas", "ornas", "andes",
        "andet", "arens",
        "arna", "erna", "orna", "ande", "arne", "aste", "aren", "ades",
        "erns",
        "ade", "are", "ern", "ens", "het", "ast",
        "ad", "en", "ar", "er", "or", "as", "es", "at",
        "a", "e", "s",
    )
    __step2_suffixes = ("dd", "gd", "nn", "dt", "gt", "kt", "tt")
    __step3_suffixes = ("fullt", "l\xF6st", "els", "lig", "ig")

    def stem(self, word):
        """
        Stem a Swedish word and return the stemmed form.

        The word is lower-cased first; a word found in ``self.stopwords``
        is returned as is. All suffix tests are made against the region R1.

        :param word: The word that is stemmed.
        :type word: str or unicode
        :return: The stemmed form.
        :rtype: unicode
        """
        word = word.lower()
        if word in self.stopwords:
            return word

        r1 = self._r1_scandinavian(word, self.__vowels)

        # STEP 1
        ending = next(
            (suffix for suffix in self.__step1_suffixes if r1.endswith(suffix)),
            None,
        )
        if ending == "s":
            # A final "s" is dropped only after one of the listed letters.
            if word[-2] in self.__s_ending:
                word, r1 = word[:-1], r1[:-1]
        elif ending is not None:
            word, r1 = word[: -len(ending)], r1[: -len(ending)]

        # STEP 2
        # Any of these endings loses just its last letter.
        if r1.endswith(self.__step2_suffixes):
            word, r1 = word[:-1], r1[:-1]

        # STEP 3
        ending = next(
            (suffix for suffix in self.__step3_suffixes if r1.endswith(suffix)),
            None,
        )
        if ending in ("els", "lig", "ig"):
            word = word[: -len(ending)]
        elif ending in ("fullt", "l\xF6st"):
            # Only the trailing "t" is removed.
            word = word[:-1]

        return word
def demo():
    """
    Run an interactive demonstration of the Snowball stemmers.

    The user is repeatedly asked for a language name. For a supported
    language, the first 300 words of the matching Universal Declaration
    of Human Rights text (part of the NLTK corpus collection) are
    stemmed, and both the original and the stemmed text are printed.
    Entering 'exit' ends the demonstration.
    """
    from nltk.corpus import udhr

    udhr_corpus = {
        "arabic": "Arabic_Alarabia-Arabic",
        "danish": "Danish_Dansk-Latin1",
        "dutch": "Dutch_Nederlands-Latin1",
        "english": "English-Latin1",
        "finnish": "Finnish_Suomi-Latin1",
        "french": "French_Francais-Latin1",
        "german": "German_Deutsch-Latin1",
        "hungarian": "Hungarian_Magyar-UTF8",
        "italian": "Italian_Italiano-Latin1",
        "norwegian": "Norwegian-Latin1",
        "porter": "English-Latin1",
        "portuguese": "Portuguese_Portugues-Latin1",
        "romanian": "Romanian_Romana-Latin2",
        "russian": "Russian-UTF8",
        "spanish": "Spanish-Latin1",
        "swedish": "Swedish_Svenska-Latin1",
    }

    def wrap(text):
        # Break the text at whitespace into lines of at most 70 characters.
        return re.sub(r"(.{,70})\s", r"\1\n", text + " ").rstrip()

    prompt = (
        "Please enter the name of the language to be demonstrated\n"
        + "/".join(SnowballStemmer.languages)
        + "\n(enter 'exit' in order to leave): "
    )
    rule = "-" * 70

    print("\n")
    print("******************************")
    print("Demo for the Snowball stemmers")
    print("******************************")

    while True:
        language = input(prompt)

        if language == "exit":
            break

        if language not in SnowballStemmer.languages:
            print("\nOops, there is no stemmer for this language. Please try again.\n")
            continue

        stemmer = SnowballStemmer(language)
        words = udhr.words(udhr_corpus[language])[:300]

        stemmed_text = wrap(" ".join(stemmer.stem(token) for token in words))
        original_text = wrap(" ".join(words))

        print("\n")
        print(rule)
        print("ORIGINAL".center(70))
        print(original_text)
        print("\n\n")
        print("STEMMED RESULTS".center(70))
        print(stemmed_text)
        print(rule)
        print("\n")
|