bo-graduation/venv/lib/python3.7/site-packages/nltk/parse/malt.py

# -*- coding: utf-8 -*-
# Natural Language Toolkit: Interface to MaltParser
#
# Author: Dan Garrette <dhgarrette@gmail.com>
# Contributor: Liling Tan, Mustufain, osamamukhtar11
#
# Copyright (C) 2001-2019 NLTK Project
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT

from __future__ import print_function, unicode_literals

import os
import sys
import tempfile
import subprocess
import inspect

from six import text_type

from nltk.data import ZipFilePathPointer
from nltk.internals import find_dir, find_file, find_jars_within_path

from nltk.parse.api import ParserI
from nltk.parse.dependencygraph import DependencyGraph
from nltk.parse.util import taggedsents_to_conll


def malt_regex_tagger():
    from nltk.tag import RegexpTagger

    _tagger = RegexpTagger(
        [
            (r'\.$', '.'),
            (r'\,$', ','),
            (r'\?$', '?'),  # fullstop, comma, Qmark
            (r'\($', '('),
            (r'\)$', ')'),  # round brackets
            (r'\[$', '['),
            (r'\]$', ']'),  # square brackets
            (r'^-?[0-9]+(.[0-9]+)?$', 'CD'),  # cardinal numbers
            (r'(The|the|A|a|An|an)$', 'DT'),  # articles
            (r'(He|he|She|she|It|it|I|me|Me|You|you)$', 'PRP'),  # pronouns
            (r'(His|his|Her|her|Its|its)$', 'PRP$'),  # possesive
            (r'(my|Your|your|Yours|yours)$', 'PRP$'),  # possesive
            (r'(on|On|in|In|at|At|since|Since)$', 'IN'),  # time prepopsitions
            (r'(for|For|ago|Ago|before|Before)$', 'IN'),  # time prepopsitions
            (r'(till|Till|until|Until)$', 'IN'),  # time prepopsitions
            (r'(by|By|beside|Beside)$', 'IN'),  # space prepopsitions
            (r'(under|Under|below|Below)$', 'IN'),  # space prepopsitions
            (r'(over|Over|above|Above)$', 'IN'),  # space prepopsitions
            (r'(across|Across|through|Through)$', 'IN'),  # space prepopsitions
            (r'(into|Into|towards|Towards)$', 'IN'),  # space prepopsitions
            (r'(onto|Onto|from|From)$', 'IN'),  # space prepopsitions
            (r'.*able$', 'JJ'),  # adjectives
            (r'.*ness$', 'NN'),  # nouns formed from adjectives
            (r'.*ly$', 'RB'),  # adverbs
            (r'.*s$', 'NNS'),  # plural nouns
            (r'.*ing$', 'VBG'),  # gerunds
            (r'.*ed$', 'VBD'),  # past tense verbs
            (r'.*', 'NN'),  # nouns (default)
        ]
    )
    return _tagger.tag


def find_maltparser(parser_dirname):
    """
    A module to find MaltParser .jar file and its dependencies.
    """
    if os.path.exists(parser_dirname):  # If a full path is given.
        _malt_dir = parser_dirname
    else:  # Try to find path to maltparser directory in environment variables.
        _malt_dir = find_dir(parser_dirname, env_vars=('MALT_PARSER',))
    # Checks that that the found directory contains all the necessary .jar
    malt_dependencies = ['', '', '']
    _malt_jars = set(find_jars_within_path(_malt_dir))
    _jars = set(os.path.split(jar)[1] for jar in _malt_jars)
    malt_dependencies = set(['log4j.jar', 'libsvm.jar', 'liblinear-1.8.jar'])

    assert malt_dependencies.issubset(_jars)
    assert any(
        filter(lambda i: i.startswith('maltparser-') and i.endswith('.jar'), _jars)
    )
    return list(_malt_jars)


def find_malt_model(model_filename):
    """
    A module to find pre-trained MaltParser model.
    """
    if model_filename is None:
        return 'malt_temp.mco'
    elif os.path.exists(model_filename):  # If a full path is given.
        return model_filename
    else:  # Try to find path to malt model in environment variables.
        return find_file(model_filename, env_vars=('MALT_MODEL',), verbose=False)


class MaltParser(ParserI):
    """
    A class for dependency parsing with MaltParser. The input is the paths to:
    - a maltparser directory
    - (optionally) the path to a pre-trained MaltParser .mco model file
    - (optionally) the tagger to use for POS tagging before parsing
    - (optionally) additional Java arguments

    Example:
        >>> from nltk.parse import malt
        >>> # With MALT_PARSER and MALT_MODEL environment set.
        >>> mp = malt.MaltParser('maltparser-1.7.2', 'engmalt.linear-1.7.mco') # doctest: +SKIP
        >>> mp.parse_one('I shot an elephant in my pajamas .'.split()).tree() # doctest: +SKIP
        (shot I (elephant an) (in (pajamas my)) .)
        >>> # Without MALT_PARSER and MALT_MODEL environment.
        >>> mp = malt.MaltParser('/home/user/maltparser-1.7.2/', '/home/user/engmalt.linear-1.7.mco') # doctest: +SKIP
        >>> mp.parse_one('I shot an elephant in my pajamas .'.split()).tree() # doctest: +SKIP
        (shot I (elephant an) (in (pajamas my)) .)
    """

    def __init__(
        self,
        parser_dirname,
        model_filename=None,
        tagger=None,
        additional_java_args=None,
    ):
        """
        An interface for parsing with the Malt Parser.

        :param parser_dirname: The path to the maltparser directory that
        contains the maltparser-1.x.jar
        :type parser_dirname: str
        :param model_filename: The name of the pre-trained model with .mco file
        extension. If provided, training will not be required.
        (see http://www.maltparser.org/mco/mco.html and
        see http://www.patful.com/chalk/node/185)
        :type model_filename: str
        :param tagger: The tagger used to POS tag the raw string before
        formatting to CONLL format. It should behave like `nltk.pos_tag`
        :type tagger: function
        :param additional_java_args: This is the additional Java arguments that
        one can use when calling Maltparser, usually this is the heapsize
        limits, e.g. `additional_java_args=['-Xmx1024m']`
        (see http://goo.gl/mpDBvQ)
        :type additional_java_args: list
        """

        # Find all the necessary jar files for MaltParser.
        self.malt_jars = find_maltparser(parser_dirname)
        # Initialize additional java arguments.
        self.additional_java_args = (
            additional_java_args if additional_java_args is not None else []
        )
        # Initialize model.
        self.model = find_malt_model(model_filename)
        self._trained = self.model != 'malt_temp.mco'
        # Set the working_dir parameters i.e. `-w` from MaltParser's option.
        self.working_dir = tempfile.gettempdir()
        # Initialize POS tagger.
        self.tagger = tagger if tagger is not None else malt_regex_tagger()

    def parse_tagged_sents(self, sentences, verbose=False, top_relation_label='null'):
        """
        Use MaltParser to parse multiple POS tagged sentences. Takes multiple
        sentences where each sentence is a list of (word, tag) tuples.
        The sentences must have already been tokenized and tagged.

        :param sentences: Input sentences to parse
        :type sentence: list(list(tuple(str, str)))
        :return: iter(iter(``DependencyGraph``)) the dependency graph
        representation of each sentence
        """
        if not self._trained:
            raise Exception("Parser has not been trained. Call train() first.")

        with tempfile.NamedTemporaryFile(
            prefix='malt_input.conll.', dir=self.working_dir, mode='w', delete=False
        ) as input_file:
            with tempfile.NamedTemporaryFile(
                prefix='malt_output.conll.',
                dir=self.working_dir,
                mode='w',
                delete=False,
            ) as output_file:
                # Convert list of sentences to CONLL format.
                for line in taggedsents_to_conll(sentences):
                    input_file.write(text_type(line))
                input_file.close()

                # Generate command to run maltparser.
                cmd = self.generate_malt_command(
                    input_file.name, output_file.name, mode="parse"
                )

                # This is a maltparser quirk, it needs to be run
                # where the model file is. otherwise it goes into an awkward
                # missing .jars or strange -w working_dir problem.
                _current_path = os.getcwd()  # Remembers the current path.
                try:  # Change to modelfile path
                    os.chdir(os.path.split(self.model)[0])
                except:
                    pass
                ret = self._execute(cmd, verbose)  # Run command.
                os.chdir(_current_path)  # Change back to current path.

                if ret is not 0:
                    raise Exception(
                        "MaltParser parsing (%s) failed with exit "
                        "code %d" % (' '.join(cmd), ret)
                    )

                # Must return iter(iter(Tree))
                with open(output_file.name) as infile:
                    for tree_str in infile.read().split('\n\n'):
                        yield (
                            iter(
                                [
                                    DependencyGraph(
                                        tree_str, top_relation_label=top_relation_label
                                    )
                                ]
                            )
                        )

        os.remove(input_file.name)
        os.remove(output_file.name)

    def parse_sents(self, sentences, verbose=False, top_relation_label='null'):
        """
        Use MaltParser to parse multiple sentences.
        Takes a list of sentences, where each sentence is a list of words.
        Each sentence will be automatically tagged with this
        MaltParser instance's tagger.

        :param sentences: Input sentences to parse
        :type sentence: list(list(str))
        :return: iter(DependencyGraph)
        """
        tagged_sentences = (self.tagger(sentence) for sentence in sentences)
        return self.parse_tagged_sents(
            tagged_sentences, verbose, top_relation_label=top_relation_label
        )

    def generate_malt_command(self, inputfilename, outputfilename=None, mode=None):
        """
        This function generates the maltparser command use at the terminal.

        :param inputfilename: path to the input file
        :type inputfilename: str
        :param outputfilename: path to the output file
        :type outputfilename: str
        """

        cmd = ['java']
        cmd += self.additional_java_args  # Adds additional java arguments
        # Joins classpaths with ";" if on Windows and on Linux/Mac use ":"
        classpaths_separator = ';' if sys.platform.startswith('win') else ':'
        cmd += [
            '-cp',
            classpaths_separator.join(self.malt_jars),
        ]  # Adds classpaths for jars
        cmd += ['org.maltparser.Malt']  # Adds the main function.

        # Adds the model file.
        if os.path.exists(self.model):  # when parsing
            cmd += ['-c', os.path.split(self.model)[-1]]
        else:  # when learning
            cmd += ['-c', self.model]

        cmd += ['-i', inputfilename]
        if mode == 'parse':
            cmd += ['-o', outputfilename]
        cmd += ['-m', mode]  # mode use to generate parses.
        return cmd

    @staticmethod
    def _execute(cmd, verbose=False):
        output = None if verbose else subprocess.PIPE
        p = subprocess.Popen(cmd, stdout=output, stderr=output)
        return p.wait()

    def train(self, depgraphs, verbose=False):
        """
        Train MaltParser from a list of ``DependencyGraph`` objects

        :param depgraphs: list of ``DependencyGraph`` objects for training input data
        :type depgraphs: DependencyGraph
        """

        # Write the conll_str to malt_train.conll file in /tmp/
        with tempfile.NamedTemporaryFile(
            prefix='malt_train.conll.', dir=self.working_dir, mode='w', delete=False
        ) as input_file:
            input_str = '\n'.join(dg.to_conll(10) for dg in depgraphs)
            input_file.write(text_type(input_str))
        # Trains the model with the malt_train.conll
        self.train_from_file(input_file.name, verbose=verbose)
        # Removes the malt_train.conll once training finishes.
        os.remove(input_file.name)

    def train_from_file(self, conll_file, verbose=False):
        """
        Train MaltParser from a file
        :param conll_file: str for the filename of the training input data
        :type conll_file: str
        """

        # If conll_file is a ZipFilePathPointer,
        # then we need to do some extra massaging
        if isinstance(conll_file, ZipFilePathPointer):
            with tempfile.NamedTemporaryFile(
                prefix='malt_train.conll.', dir=self.working_dir, mode='w', delete=False
            ) as input_file:
                with conll_file.open() as conll_input_file:
                    conll_str = conll_input_file.read()
                    input_file.write(text_type(conll_str))
                return self.train_from_file(input_file.name, verbose=verbose)

        # Generate command to run maltparser.
        cmd = self.generate_malt_command(conll_file, mode="learn")
        ret = self._execute(cmd, verbose)
        if ret != 0:
            raise Exception(
                "MaltParser training (%s) failed with exit "
                "code %d" % (' '.join(cmd), ret)
            )
        self._trained = True


if __name__ == '__main__':
    '''
    A demonstration function to show how NLTK users can use the malt parser API.

    >>> from nltk import pos_tag
    >>> assert 'MALT_PARSER' in os.environ, str(
    ... "Please set MALT_PARSER in your global environment, e.g.:\n"
    ... "$ export MALT_PARSER='/home/user/maltparser-1.7.2/'")
    >>>
    >>> assert 'MALT_MODEL' in os.environ, str(
    ... "Please set MALT_MODEL in your global environment, e.g.:\n"
    ... "$ export MALT_MODEL='/home/user/engmalt.linear-1.7.mco'")
    >>>
    >>> _dg1_str = str("1    John    _    NNP   _    _    2    SUBJ    _    _\n"
    ...             "2    sees    _    VB    _    _    0    ROOT    _    _\n"
    ...             "3    a       _    DT    _    _    4    SPEC    _    _\n"
    ...             "4    dog     _    NN    _    _    2    OBJ     _    _\n"
    ...             "5    .     _    .    _    _    2    PUNCT     _    _\n")
    >>>
    >>>
    >>> _dg2_str  = str("1    John    _    NNP   _    _    2    SUBJ    _    _\n"
    ...             "2    walks   _    VB    _    _    0    ROOT    _    _\n"
    ...             "3    .     _    .    _    _    2    PUNCT     _    _\n")
    >>> dg1 = DependencyGraph(_dg1_str)
    >>> dg2 = DependencyGraph(_dg2_str)
    >>> # Initialize a MaltParser object
    >>> parser_dirname = 'maltparser-1.7.2'
    >>> mp = MaltParser(parser_dirname=parser_dirname)
    >>>
    >>> # Trains a model.
    >>> mp.train([dg1,dg2], verbose=False)
    >>> sent1 = ['John','sees','Mary', '.']
    >>> sent2 = ['John', 'walks', 'a', 'dog', '.']
    >>>
    >>> # Parse a single sentence.
    >>> parsed_sent1 = mp.parse_one(sent1)
    >>> parsed_sent2 = mp.parse_one(sent2)
    >>> print (parsed_sent1.tree())
    (sees John Mary .)
    >>> print (parsed_sent2.tree())
    (walks John (dog a) .)
    >>>
    >>> # Parsing multiple sentences.
    >>> sentences = [sent1,sent2]
    >>> parsed_sents = mp.parse_sents(sentences)
    >>> print(next(next(parsed_sents)).tree())
    (sees John Mary .)
    >>> print(next(next(parsed_sents)).tree())
    (walks John (dog a) .)
    >>>
    >>> # Initialize a MaltParser object with an English pre-trained model.
    >>> parser_dirname = 'maltparser-1.7.2'
    >>> model_name = 'engmalt.linear-1.7.mco'
    >>> mp = MaltParser(parser_dirname=parser_dirname, model_filename=model_name, tagger=pos_tag)
    >>> sent1 = 'I shot an elephant in my pajamas .'.split()
    >>> sent2 = 'Time flies like banana .'.split()
    >>> # Parse a single sentence.
    >>> print(mp.parse_one(sent1).tree())
    (shot I (elephant an) (in (pajamas my)) .)
    # Parsing multiple sentences
    >>> sentences = [sent1,sent2]
    >>> parsed_sents = mp.parse_sents(sentences)
    >>> print(next(next(parsed_sents)).tree())
    (shot I (elephant an) (in (pajamas my)) .)
    >>> print(next(next(parsed_sents)).tree())
    (flies Time (like banana) .)
    '''
    import doctest

    doctest.testmod()