You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

396 lines
16 KiB
Python

# -*- coding: utf-8 -*-
# Natural Language Toolkit: Interface to MaltParser
#
# Author: Dan Garrette <dhgarrette@gmail.com>
# Contributor: Liling Tan, Mustufain, osamamukhtar11
#
# Copyright (C) 2001-2020 NLTK Project
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
import os
import sys
import tempfile
import subprocess
import inspect
from nltk.data import ZipFilePathPointer
from nltk.internals import find_dir, find_file, find_jars_within_path
from nltk.parse.api import ParserI
from nltk.parse.dependencygraph import DependencyGraph
from nltk.parse.util import taggedsents_to_conll
def malt_regex_tagger():
from nltk.tag import RegexpTagger
_tagger = RegexpTagger(
[
(r"\.$", "."),
(r"\,$", ","),
(r"\?$", "?"), # fullstop, comma, Qmark
(r"\($", "("),
(r"\)$", ")"), # round brackets
(r"\[$", "["),
(r"\]$", "]"), # square brackets
(r"^-?[0-9]+(.[0-9]+)?$", "CD"), # cardinal numbers
(r"(The|the|A|a|An|an)$", "DT"), # articles
(r"(He|he|She|she|It|it|I|me|Me|You|you)$", "PRP"), # pronouns
(r"(His|his|Her|her|Its|its)$", "PRP$"), # possesive
(r"(my|Your|your|Yours|yours)$", "PRP$"), # possesive
(r"(on|On|in|In|at|At|since|Since)$", "IN"), # time prepopsitions
(r"(for|For|ago|Ago|before|Before)$", "IN"), # time prepopsitions
(r"(till|Till|until|Until)$", "IN"), # time prepopsitions
(r"(by|By|beside|Beside)$", "IN"), # space prepopsitions
(r"(under|Under|below|Below)$", "IN"), # space prepopsitions
(r"(over|Over|above|Above)$", "IN"), # space prepopsitions
(r"(across|Across|through|Through)$", "IN"), # space prepopsitions
(r"(into|Into|towards|Towards)$", "IN"), # space prepopsitions
(r"(onto|Onto|from|From)$", "IN"), # space prepopsitions
(r".*able$", "JJ"), # adjectives
(r".*ness$", "NN"), # nouns formed from adjectives
(r".*ly$", "RB"), # adverbs
(r".*s$", "NNS"), # plural nouns
(r".*ing$", "VBG"), # gerunds
(r".*ed$", "VBD"), # past tense verbs
(r".*", "NN"), # nouns (default)
]
)
return _tagger.tag
def find_maltparser(parser_dirname):
"""
A module to find MaltParser .jar file and its dependencies.
"""
if os.path.exists(parser_dirname): # If a full path is given.
_malt_dir = parser_dirname
else: # Try to find path to maltparser directory in environment variables.
_malt_dir = find_dir(parser_dirname, env_vars=("MALT_PARSER",))
# Checks that that the found directory contains all the necessary .jar
malt_dependencies = ["", "", ""]
_malt_jars = set(find_jars_within_path(_malt_dir))
_jars = set(os.path.split(jar)[1] for jar in _malt_jars)
malt_dependencies = set(["log4j.jar", "libsvm.jar", "liblinear-1.8.jar"])
assert malt_dependencies.issubset(_jars)
assert any(
filter(lambda i: i.startswith("maltparser-") and i.endswith(".jar"), _jars)
)
return list(_malt_jars)
def find_malt_model(model_filename):
"""
A module to find pre-trained MaltParser model.
"""
if model_filename is None:
return "malt_temp.mco"
elif os.path.exists(model_filename): # If a full path is given.
return model_filename
else: # Try to find path to malt model in environment variables.
return find_file(model_filename, env_vars=("MALT_MODEL",), verbose=False)
class MaltParser(ParserI):
"""
A class for dependency parsing with MaltParser. The input is the paths to:
- a maltparser directory
- (optionally) the path to a pre-trained MaltParser .mco model file
- (optionally) the tagger to use for POS tagging before parsing
- (optionally) additional Java arguments
Example:
>>> from nltk.parse import malt
>>> # With MALT_PARSER and MALT_MODEL environment set.
>>> mp = malt.MaltParser('maltparser-1.7.2', 'engmalt.linear-1.7.mco') # doctest: +SKIP
>>> mp.parse_one('I shot an elephant in my pajamas .'.split()).tree() # doctest: +SKIP
(shot I (elephant an) (in (pajamas my)) .)
>>> # Without MALT_PARSER and MALT_MODEL environment.
>>> mp = malt.MaltParser('/home/user/maltparser-1.7.2/', '/home/user/engmalt.linear-1.7.mco') # doctest: +SKIP
>>> mp.parse_one('I shot an elephant in my pajamas .'.split()).tree() # doctest: +SKIP
(shot I (elephant an) (in (pajamas my)) .)
"""
def __init__(
self,
parser_dirname,
model_filename=None,
tagger=None,
additional_java_args=None,
):
"""
An interface for parsing with the Malt Parser.
:param parser_dirname: The path to the maltparser directory that
contains the maltparser-1.x.jar
:type parser_dirname: str
:param model_filename: The name of the pre-trained model with .mco file
extension. If provided, training will not be required.
(see http://www.maltparser.org/mco/mco.html and
see http://www.patful.com/chalk/node/185)
:type model_filename: str
:param tagger: The tagger used to POS tag the raw string before
formatting to CONLL format. It should behave like `nltk.pos_tag`
:type tagger: function
:param additional_java_args: This is the additional Java arguments that
one can use when calling Maltparser, usually this is the heapsize
limits, e.g. `additional_java_args=['-Xmx1024m']`
(see http://goo.gl/mpDBvQ)
:type additional_java_args: list
"""
# Find all the necessary jar files for MaltParser.
self.malt_jars = find_maltparser(parser_dirname)
# Initialize additional java arguments.
self.additional_java_args = (
additional_java_args if additional_java_args is not None else []
)
# Initialize model.
self.model = find_malt_model(model_filename)
self._trained = self.model != "malt_temp.mco"
# Set the working_dir parameters i.e. `-w` from MaltParser's option.
self.working_dir = tempfile.gettempdir()
# Initialize POS tagger.
self.tagger = tagger if tagger is not None else malt_regex_tagger()
def parse_tagged_sents(self, sentences, verbose=False, top_relation_label="null"):
"""
Use MaltParser to parse multiple POS tagged sentences. Takes multiple
sentences where each sentence is a list of (word, tag) tuples.
The sentences must have already been tokenized and tagged.
:param sentences: Input sentences to parse
:type sentence: list(list(tuple(str, str)))
:return: iter(iter(``DependencyGraph``)) the dependency graph
representation of each sentence
"""
if not self._trained:
raise Exception("Parser has not been trained. Call train() first.")
with tempfile.NamedTemporaryFile(
prefix="malt_input.conll.", dir=self.working_dir, mode="w", delete=False
) as input_file:
with tempfile.NamedTemporaryFile(
prefix="malt_output.conll.",
dir=self.working_dir,
mode="w",
delete=False,
) as output_file:
# Convert list of sentences to CONLL format.
for line in taggedsents_to_conll(sentences):
input_file.write(str(line))
input_file.close()
# Generate command to run maltparser.
cmd = self.generate_malt_command(
input_file.name, output_file.name, mode="parse"
)
# This is a maltparser quirk, it needs to be run
# where the model file is. otherwise it goes into an awkward
# missing .jars or strange -w working_dir problem.
_current_path = os.getcwd() # Remembers the current path.
try: # Change to modelfile path
os.chdir(os.path.split(self.model)[0])
except:
pass
ret = self._execute(cmd, verbose) # Run command.
os.chdir(_current_path) # Change back to current path.
if ret != 0:
raise Exception(
"MaltParser parsing (%s) failed with exit "
"code %d" % (" ".join(cmd), ret)
)
# Must return iter(iter(Tree))
with open(output_file.name) as infile:
for tree_str in infile.read().split("\n\n"):
yield (
iter(
[
DependencyGraph(
tree_str, top_relation_label=top_relation_label
)
]
)
)
os.remove(input_file.name)
os.remove(output_file.name)
def parse_sents(self, sentences, verbose=False, top_relation_label="null"):
"""
Use MaltParser to parse multiple sentences.
Takes a list of sentences, where each sentence is a list of words.
Each sentence will be automatically tagged with this
MaltParser instance's tagger.
:param sentences: Input sentences to parse
:type sentence: list(list(str))
:return: iter(DependencyGraph)
"""
tagged_sentences = (self.tagger(sentence) for sentence in sentences)
return self.parse_tagged_sents(
tagged_sentences, verbose, top_relation_label=top_relation_label
)
def generate_malt_command(self, inputfilename, outputfilename=None, mode=None):
"""
This function generates the maltparser command use at the terminal.
:param inputfilename: path to the input file
:type inputfilename: str
:param outputfilename: path to the output file
:type outputfilename: str
"""
cmd = ["java"]
cmd += self.additional_java_args # Adds additional java arguments
# Joins classpaths with ";" if on Windows and on Linux/Mac use ":"
classpaths_separator = ";" if sys.platform.startswith("win") else ":"
cmd += [
"-cp",
classpaths_separator.join(self.malt_jars),
] # Adds classpaths for jars
cmd += ["org.maltparser.Malt"] # Adds the main function.
# Adds the model file.
if os.path.exists(self.model): # when parsing
cmd += ["-c", os.path.split(self.model)[-1]]
else: # when learning
cmd += ["-c", self.model]
cmd += ["-i", inputfilename]
if mode == "parse":
cmd += ["-o", outputfilename]
cmd += ["-m", mode] # mode use to generate parses.
return cmd
@staticmethod
def _execute(cmd, verbose=False):
output = None if verbose else subprocess.PIPE
p = subprocess.Popen(cmd, stdout=output, stderr=output)
return p.wait()
def train(self, depgraphs, verbose=False):
"""
Train MaltParser from a list of ``DependencyGraph`` objects
:param depgraphs: list of ``DependencyGraph`` objects for training input data
:type depgraphs: DependencyGraph
"""
# Write the conll_str to malt_train.conll file in /tmp/
with tempfile.NamedTemporaryFile(
prefix="malt_train.conll.", dir=self.working_dir, mode="w", delete=False
) as input_file:
input_str = "\n".join(dg.to_conll(10) for dg in depgraphs)
input_file.write(str(input_str))
# Trains the model with the malt_train.conll
self.train_from_file(input_file.name, verbose=verbose)
# Removes the malt_train.conll once training finishes.
os.remove(input_file.name)
def train_from_file(self, conll_file, verbose=False):
"""
Train MaltParser from a file
:param conll_file: str for the filename of the training input data
:type conll_file: str
"""
# If conll_file is a ZipFilePathPointer,
# then we need to do some extra massaging
if isinstance(conll_file, ZipFilePathPointer):
with tempfile.NamedTemporaryFile(
prefix="malt_train.conll.", dir=self.working_dir, mode="w", delete=False
) as input_file:
with conll_file.open() as conll_input_file:
conll_str = conll_input_file.read()
input_file.write(str(conll_str))
return self.train_from_file(input_file.name, verbose=verbose)
# Generate command to run maltparser.
cmd = self.generate_malt_command(conll_file, mode="learn")
ret = self._execute(cmd, verbose)
if ret != 0:
raise Exception(
"MaltParser training (%s) failed with exit "
"code %d" % (" ".join(cmd), ret)
)
self._trained = True
if __name__ == '__main__':
"""
A demonstration function to show how NLTK users can use the malt parser API.
>>> from nltk import pos_tag
>>> assert 'MALT_PARSER' in os.environ, str(
... "Please set MALT_PARSER in your global environment, e.g.:\n"
... "$ export MALT_PARSER='/home/user/maltparser-1.7.2/'")
>>>
>>> assert 'MALT_MODEL' in os.environ, str(
... "Please set MALT_MODEL in your global environment, e.g.:\n"
... "$ export MALT_MODEL='/home/user/engmalt.linear-1.7.mco'")
>>>
>>> _dg1_str = str("1 John _ NNP _ _ 2 SUBJ _ _\n"
... "2 sees _ VB _ _ 0 ROOT _ _\n"
... "3 a _ DT _ _ 4 SPEC _ _\n"
... "4 dog _ NN _ _ 2 OBJ _ _\n"
... "5 . _ . _ _ 2 PUNCT _ _\n")
>>>
>>>
>>> _dg2_str = str("1 John _ NNP _ _ 2 SUBJ _ _\n"
... "2 walks _ VB _ _ 0 ROOT _ _\n"
... "3 . _ . _ _ 2 PUNCT _ _\n")
>>> dg1 = DependencyGraph(_dg1_str)
>>> dg2 = DependencyGraph(_dg2_str)
>>> # Initialize a MaltParser object
>>> parser_dirname = 'maltparser-1.7.2'
>>> mp = MaltParser(parser_dirname=parser_dirname)
>>>
>>> # Trains a model.
>>> mp.train([dg1,dg2], verbose=False)
>>> sent1 = ['John','sees','Mary', '.']
>>> sent2 = ['John', 'walks', 'a', 'dog', '.']
>>>
>>> # Parse a single sentence.
>>> parsed_sent1 = mp.parse_one(sent1)
>>> parsed_sent2 = mp.parse_one(sent2)
>>> print(parsed_sent1.tree())
(sees John Mary .)
>>> print(parsed_sent2.tree())
(walks John (dog a) .)
>>>
>>> # Parsing multiple sentences.
>>> sentences = [sent1,sent2]
>>> parsed_sents = mp.parse_sents(sentences)
>>> print(next(next(parsed_sents)).tree())
(sees John Mary .)
>>> print(next(next(parsed_sents)).tree())
(walks John (dog a) .)
>>>
>>> # Initialize a MaltParser object with an English pre-trained model.
>>> parser_dirname = 'maltparser-1.7.2'
>>> model_name = 'engmalt.linear-1.7.mco'
>>> mp = MaltParser(parser_dirname=parser_dirname, model_filename=model_name, tagger=pos_tag)
>>> sent1 = 'I shot an elephant in my pajamas .'.split()
>>> sent2 = 'Time flies like banana .'.split()
>>> # Parse a single sentence.
>>> print(mp.parse_one(sent1).tree())
(shot I (elephant an) (in (pajamas my)) .)
# Parsing multiple sentences
>>> sentences = [sent1,sent2]
>>> parsed_sents = mp.parse_sents(sentences)
>>> print(next(next(parsed_sents)).tree())
(shot I (elephant an) (in (pajamas my)) .)
>>> print(next(next(parsed_sents)).tree())
(flies Time (like banana) .)
"""
import doctest
doctest.testmod()