# Natural Language Toolkit: test_stem.py — unit tests for the Snowball and
# Porter stemmers.
  1. # -*- coding: utf-8 -*-
  2. import os
  3. import unittest
  4. from contextlib import closing
  5. from nltk import data
  6. from nltk.stem.snowball import SnowballStemmer
  7. from nltk.stem.porter import PorterStemmer
  8. class SnowballTest(unittest.TestCase):
  9. def test_arabic(self):
  10. """
  11. this unit testing for test the snowball arabic light stemmer
  12. this stemmer deals with prefixes and suffixes
  13. """
  14. # Test where the ignore_stopwords=True.
  15. ar_stemmer = SnowballStemmer("arabic", True)
  16. assert ar_stemmer.stem('الْعَرَبِــــــيَّة') == "عرب"
  17. assert ar_stemmer.stem("العربية") == "عرب"
  18. assert ar_stemmer.stem("فقالوا") == "قال"
  19. assert ar_stemmer.stem("الطالبات") == "طالب"
  20. assert ar_stemmer.stem("فالطالبات") == "طالب"
  21. assert ar_stemmer.stem("والطالبات") == "طالب"
  22. assert ar_stemmer.stem("الطالبون") == "طالب"
  23. assert ar_stemmer.stem("اللذان") == "اللذان"
  24. assert ar_stemmer.stem("من") == "من"
  25. # Test where the ignore_stopwords=False.
  26. ar_stemmer = SnowballStemmer("arabic", False)
  27. assert ar_stemmer.stem("اللذان") == "اللذ" # this is a stop word
  28. assert ar_stemmer.stem("الطالبات") == "طالب"
  29. assert ar_stemmer.stem("الكلمات") == "كلم"
  30. # test where create the arabic stemmer without given init value to ignore_stopwords
  31. ar_stemmer = SnowballStemmer("arabic")
  32. assert ar_stemmer.stem('الْعَرَبِــــــيَّة') == "عرب"
  33. assert ar_stemmer.stem("العربية") == "عرب"
  34. assert ar_stemmer.stem("فقالوا") == "قال"
  35. assert ar_stemmer.stem("الطالبات") == "طالب"
  36. assert ar_stemmer.stem("الكلمات") == "كلم"
  37. def test_russian(self):
  38. stemmer_russian = SnowballStemmer("russian")
  39. assert stemmer_russian.stem("авантненькая") == "авантненьк"
  40. def test_german(self):
  41. stemmer_german = SnowballStemmer("german")
  42. stemmer_german2 = SnowballStemmer("german", ignore_stopwords=True)
  43. assert stemmer_german.stem("Schr\xe4nke") == 'schrank'
  44. assert stemmer_german2.stem("Schr\xe4nke") == 'schrank'
  45. assert stemmer_german.stem("keinen") == 'kein'
  46. assert stemmer_german2.stem("keinen") == 'keinen'
  47. def test_spanish(self):
  48. stemmer = SnowballStemmer('spanish')
  49. assert stemmer.stem("Visionado") == 'vision'
  50. # The word 'algue' was raising an IndexError
  51. assert stemmer.stem("algue") == 'algu'
  52. def test_short_strings_bug(self):
  53. stemmer = SnowballStemmer('english')
  54. assert stemmer.stem("y's") == 'y'
  55. class PorterTest(unittest.TestCase):
  56. def _vocabulary(self):
  57. with closing(
  58. data.find('stemmers/porter_test/porter_vocabulary.txt').open(
  59. encoding='utf-8'
  60. )
  61. ) as fp:
  62. return fp.read().splitlines()
  63. def _test_against_expected_output(self, stemmer_mode, expected_stems):
  64. stemmer = PorterStemmer(mode=stemmer_mode)
  65. for word, true_stem in zip(self._vocabulary(), expected_stems):
  66. our_stem = stemmer.stem(word)
  67. assert our_stem == true_stem, (
  68. "%s should stem to %s in %s mode but got %s"
  69. % (word, true_stem, stemmer_mode, our_stem)
  70. )
  71. def test_vocabulary_martin_mode(self):
  72. """Tests all words from the test vocabulary provided by M Porter
  73. The sample vocabulary and output were sourced from:
  74. http://tartarus.org/martin/PorterStemmer/voc.txt
  75. http://tartarus.org/martin/PorterStemmer/output.txt
  76. and are linked to from the Porter Stemmer algorithm's homepage
  77. at
  78. http://tartarus.org/martin/PorterStemmer/
  79. """
  80. with closing(
  81. data.find('stemmers/porter_test/porter_martin_output.txt').open(
  82. encoding='utf-8'
  83. )
  84. ) as fp:
  85. self._test_against_expected_output(
  86. PorterStemmer.MARTIN_EXTENSIONS, fp.read().splitlines()
  87. )
  88. def test_vocabulary_nltk_mode(self):
  89. with closing(
  90. data.find('stemmers/porter_test/porter_nltk_output.txt').open(
  91. encoding='utf-8'
  92. )
  93. ) as fp:
  94. self._test_against_expected_output(
  95. PorterStemmer.NLTK_EXTENSIONS, fp.read().splitlines()
  96. )
  97. def test_vocabulary_original_mode(self):
  98. # The list of stems for this test was generated by taking the
  99. # Martin-blessed stemmer from
  100. # http://tartarus.org/martin/PorterStemmer/c.txt
  101. # and removing all the --DEPARTURE-- sections from it and
  102. # running it against Martin's test vocabulary.
  103. with closing(
  104. data.find('stemmers/porter_test/porter_original_output.txt').open(
  105. encoding='utf-8'
  106. )
  107. ) as fp:
  108. self._test_against_expected_output(
  109. PorterStemmer.ORIGINAL_ALGORITHM, fp.read().splitlines()
  110. )
  111. self._test_against_expected_output(
  112. PorterStemmer.ORIGINAL_ALGORITHM,
  113. data.find('stemmers/porter_test/porter_original_output.txt')
  114. .open(encoding='utf-8')
  115. .read()
  116. .splitlines(),
  117. )
  118. def test_oed_bug(self):
  119. """Test for bug https://github.com/nltk/nltk/issues/1581
  120. Ensures that 'oed' can be stemmed without throwing an error.
  121. """
  122. assert PorterStemmer().stem('oed') == 'o'