tadm.py

# Natural Language Toolkit: Interface to TADM Classifier
#
# Copyright (C) 2001-2020 NLTK Project
# Author: Joseph Frazee <jfrazee@mail.utexas.edu>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT

import sys
import subprocess

from nltk.internals import find_binary

# numpy is optional at import time; it is only needed by parse_tadm_weights().
try:
    import numpy
except ImportError:
    pass

_tadm_bin = None


def config_tadm(bin=None):
    """Locate the ``tadm`` binary and remember its path for later calls."""
    global _tadm_bin
    _tadm_bin = find_binary(
        "tadm", bin, env_vars=["TADM"], binary_names=["tadm"], url="http://tadm.sf.net"
    )
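
# Usage sketch (the path below is hypothetical): point NLTK at a specific
# tadm executable instead of letting find_binary() locate one via the TADM
# environment variable.
#
#     config_tadm("/usr/local/bin/tadm")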


def write_tadm_file(train_toks, encoding, stream):
    """
    Generate an input file for ``tadm`` based on the given corpus of
    classified tokens.

    :type train_toks: list(tuple(dict, str))
    :param train_toks: Training data, represented as a list of
        pairs, the first member of which is a feature dictionary,
        and the second of which is a classification label.
    :type encoding: TadmEventMaxentFeatureEncoding
    :param encoding: A feature encoding, used to convert featuresets
        into feature vectors.
    :type stream: stream
    :param stream: The stream to which the ``tadm`` input file should be
        written.
    """
    # See the following for a file format description:
    #
    # http://sf.net/forum/forum.php?thread_id=1391502&forum_id=473054
    # http://sf.net/forum/forum.php?thread_id=1675097&forum_id=473054
    labels = encoding.labels()
    for featureset, label in train_toks:
        length_line = "%d\n" % len(labels)
        stream.write(length_line)
        for known_label in labels:
            v = encoding.encode(featureset, known_label)
            line = "%d %d %s\n" % (
                int(label == known_label),
                len(v),
                " ".join("%d %d" % u for u in v),
            )
            stream.write(line)
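
# Sketch of the output produced above: each training token contributes one
# line giving the number of labels, then one line per candidate label of the
# form "<1 if correct else 0> <number of feature pairs> <fid> <val> ...".
# The tiny featuresets below are illustrative, and the feature ids/values in
# the printed output depend entirely on the trained encoding.
#
#     import io
#     from nltk.classify.maxent import TadmEventMaxentFeatureEncoding
#
#     toks = [({"f0": 1, "f1": 1}, "A"), ({"f0": 1, "f2": 1}, "B")]
#     enc = TadmEventMaxentFeatureEncoding.train(toks)
#     out = io.StringIO()
#     write_tadm_file(toks, enc, out)
#     print(out.getvalue())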


def parse_tadm_weights(paramfile):
    """
    Given the parameter output produced by ``tadm`` when training a
    model, with one floating-point weight per line, return a ``numpy``
    array containing the corresponding weight vector.
    """
    weights = []
    for line in paramfile:
        weights.append(float(line.strip()))
    return numpy.array(weights, "d")
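
# Usage sketch, assuming a hypothetical "params.txt" written by tadm with one
# weight per line:
#
#     with open("params.txt") as paramfile:
#         weights = parse_tadm_weights(paramfile)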


def call_tadm(args):
    """
    Call the ``tadm`` binary with the given arguments.
    """
    if isinstance(args, str):
        raise TypeError("args should be a list of strings")
    if _tadm_bin is None:
        config_tadm()

    # Call tadm via a subprocess, letting its stdout stream to our own stdout
    # and capturing stderr so it can be reported on failure.
    cmd = [_tadm_bin] + args
    p = subprocess.Popen(cmd, stdout=sys.stdout, stderr=subprocess.PIPE)
    (stdout, stderr) = p.communicate()

    # Check the return code.
    if p.returncode != 0:
        print()
        print(stderr.decode(errors="replace"))
        raise OSError("tadm command failed!")
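
# End-to-end usage sketch. The command-line options mirror those that NLTK's
# TadmMaxentClassifier passes to tadm, but treat the specific flags, method
# name, and file names here as assumptions; consult the TADM documentation
# for the authoritative option list.
#
#     with open("events.txt", "w") as events:
#         write_tadm_file(train_toks, encoding, events)
#     call_tadm(["-monitor", "-method", "tao_lmvm",
#                "-events_in", "events.txt", "-params_out", "params.txt"])
#     with open("params.txt") as params:
#         weights = parse_tadm_weights(params)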


def names_demo():
    from nltk.classify.util import names_demo
    from nltk.classify.maxent import TadmMaxentClassifier

    classifier = names_demo(TadmMaxentClassifier.train)


def encoding_demo():
    import sys
    from nltk.classify.maxent import TadmEventMaxentFeatureEncoding

    tokens = [
        ({"f0": 1, "f1": 1, "f3": 1}, "A"),
        ({"f0": 1, "f2": 1, "f4": 1}, "B"),
        ({"f0": 2, "f2": 1, "f3": 1, "f4": 1}, "A"),
    ]
    encoding = TadmEventMaxentFeatureEncoding.train(tokens)
    write_tadm_file(tokens, encoding, sys.stdout)
    print()
    for i in range(encoding.length()):
        print("%s --> %d" % (encoding.describe(i), i))
    print()


if __name__ == "__main__":
    encoding_demo()
    names_demo()