bo-graduation/nltk-book/pattern-master/pattern/graph/commonsense.py

#### PATTERN | COMMONSENSE #########################################################################
# Copyright (c) 2010 University of Antwerp, Belgium
# Author: Tom De Smedt <tom@organisms.be>
# License: BSD (see LICENSE.txt for details).
# http://www.clips.ua.ac.be/pages/pattern

####################################################################################################

from __future__ import print_function
from __future__ import absolute_import
from __future__ import unicode_literals
from __future__ import division

from builtins import str, bytes, dict, int
from builtins import map, zip, filter
from builtins import object, range

from codecs import BOM_UTF8
from itertools import chain
from functools import cmp_to_key

from io import open

try:
    # Python 2
    from urllib import urlopen
except ImportError:
    # Python 3
    from urllib.request import urlopen

from .__init__ import Graph, Node, Edge, bfs
from .__init__ import WEIGHT, CENTRALITY, EIGENVECTOR, BETWEENNESS

import os
import sys

try:
    MODULE = os.path.dirname(os.path.realpath(__file__))
except:
    MODULE = ""

if sys.version > "3":
    BOM_UTF8 = str(BOM_UTF8.decode("utf-8"))
else:
    BOM_UTF8 = BOM_UTF8.decode("utf-8")

#### COMMONSENSE SEMANTIC NETWORK ##################################################################

#--- CONCEPT ---------------------------------------------------------------------------------------


class Concept(Node):

    def __init__(self, *args, **kwargs):
        """ A concept in the sematic network.
        """
        Node.__init__(self, *args, **kwargs)
        self._properties = None

    @property
    def halo(self, depth=2):
        """ Returns the concept halo: a list with this concept + surrounding concepts.
            This is useful to reason more fluidly about the concept,
            since the halo will include latent properties linked to nearby concepts.
        """
        return self.flatten(depth=depth)

    @property
    def properties(self):
        """ Returns the top properties in the concept halo, sorted by betweenness centrality.
            The return value is a list of concept id's instead of Concepts (for performance).
        """
        if self._properties is None:
            g = self.graph.copy(nodes=self.halo)
            p = (n for n in g.nodes if n.id in self.graph.properties)
            p = [n.id for n in reversed(sorted(p, key=lambda n: n.centrality))]
            self._properties = p
        return self._properties


def halo(concept, depth=2):
    return concept.flatten(depth=depth)


def properties(concept, depth=2, centrality=BETWEENNESS):
    g = concept.graph.copy(nodes=halo(concept, depth))
    p = (n for n in g.nodes if n.id in concept.graph.properties)
    p = [n.id for n in reversed(sorted(p, key=lambda n: getattr(n, centrality)))]
    return p

#--- RELATION --------------------------------------------------------------------------------------


class Relation(Edge):

    def __init__(self, *args, **kwargs):
        """ A relation between two concepts, with an optional context.
            For example, "Felix is-a cat" is in the "media" context, "tiger is-a cat" in "nature".
        """
        self.context = kwargs.pop("context", None)
        Edge.__init__(self, *args, **kwargs)

#--- HEURISTICS ------------------------------------------------------------------------------------
# Similarity between concepts is measured using a featural approach:
# a comparison of the features/properties that are salient in each concept's halo.
# Commonsense.similarity() takes an optional "heuristic" parameter to tweak this behavior.
# It is a tuple of two functions:
# 1) function(concept) returns a list of salient properties (or other),
# 2) function(concept1, concept2) returns the cost to traverse this edge (0.0-1.0).

COMMONALITY = (
    # Similarity heuristic that only traverses relations between properties.
    lambda concept: concept.properties,
    lambda edge: 1 - int(edge.context == "properties" and \
                         edge.type != "is-opposite-of"))

#--- COMMONSENSE -----------------------------------------------------------------------------------


class Commonsense(Graph):

    def __init__(self, data=os.path.join(MODULE, "commonsense.csv"), **kwargs):
        """ A semantic network of commonsense, using different relation types:
            - is-a,
            - is-part-of,
            - is-opposite-of,
            - is-property-of,
            - is-related-to,
            - is-same-as,
            - is-effect-of.
        """
        Graph.__init__(self, **kwargs)
        self._properties = None
        # Load data from the given path,
        # a CSV-file of (concept1, relation, concept2, context, weight)-items.
        if data is not None:
            s = open(data, encoding = 'utf-8').read()
            s = s.strip(BOM_UTF8)
            s = ((v.strip("\"") for v in r.split(",")) for r in s.splitlines())
            for concept1, relation, concept2, context, weight in s:
                self.add_edge(concept1, concept2,
                    type = relation,
                 context = context,
                  weight = min(int(weight) * 0.1, 1.0))

    @property
    def concepts(self):
        return self.nodes

    @property
    def relations(self):
        return self.edges

    @property
    def properties(self):
        """ Yields all concepts that are properties (i.e., adjectives).
            For example: "cold is-property-of winter" => "cold".
        """
        if self._properties is None:
            #self._properties = set(e.node1.id for e in self.edges if e.type == "is-property-of")
            self._properties = (e for e in self.edges if e.context == "properties")
            self._properties = set(chain(*((e.node1.id, e.node2.id) for e in self._properties)))
        return self._properties

    def add_node(self, id, *args, **kwargs):
        """ Returns a Concept (Node subclass).
        """
        self._properties = None
        kwargs.setdefault("base", Concept)
        return Graph.add_node(self, id, *args, **kwargs)

    def add_edge(self, id1, id2, *args, **kwargs):
        """ Returns a Relation between two concepts (Edge subclass).
        """
        self._properties = None
        kwargs.setdefault("base", Relation)
        return Graph.add_edge(self, id1, id2, *args, **kwargs)

    def remove(self, x):
        self._properties = None
        Graph.remove(self, x)

    def similarity(self, concept1, concept2, k=3, heuristic=COMMONALITY):
        """ Returns the similarity of the given concepts,
            by cross-comparing shortest path distance between k concept properties.
            A given concept can also be a flat list of properties, e.g. ["creepy"].
            The given heuristic is a tuple of two functions:
            1) function(concept) returns a list of salient properties,
            2) function(edge) returns the cost for traversing this edge (0.0-1.0).
        """
        if isinstance(concept1, str):
            concept1 = self[concept1]
        if isinstance(concept2, str):
            concept2 = self[concept2]
        if isinstance(concept1, Node):
            concept1 = heuristic[0](concept1)
        if isinstance(concept2, Node):
            concept2 = heuristic[0](concept2)
        if isinstance(concept1, list):
            concept1 = [isinstance(n, Node) and n or self[n] for n in concept1]
        if isinstance(concept2, list):
            concept2 = [isinstance(n, Node) and n or self[n] for n in concept2]
        h = lambda id1, id2: heuristic[1](self.edge(id1, id2))
        w = 0.0
        for p1 in concept1[:k]:
            for p2 in concept2[:k]:
                p = self.shortest_path(p1, p2, heuristic=h)
                w += 1.0 / (p is None and 1e10 or len(p))
        return w / k

    def nearest_neighbors(self, concept, concepts=[], k=3):
        """ Returns the k most similar concepts from the given list.
        """
        return sorted(concepts, key=lambda candidate: self.similarity(concept, candidate, k), reverse=True)

    similar = neighbors = nn = nearest_neighbors

    def taxonomy(self, concept, depth=3, fringe=2):
        """ Returns a list of concepts that are descendants of the given concept, using "is-a" relations.
            Creates a subgraph of "is-a" related concepts up to the given depth,
            then takes the fringe (i.e., leaves) of the subgraph.
        """
        def traversable(node, edge):
            # Follow parent-child edges.
            return edge.node2 == node and edge.type == "is-a"
        if not isinstance(concept, Node):
            concept = self[concept]
        g = self.copy(nodes=concept.flatten(depth, traversable))
        g = g.fringe(depth=fringe)
        g = [self[n.id] for n in g if n != concept]
        return g

    field = semantic_field = taxonomy

#g = Commonsense()
#print(g.nn("party", g.field("animal")))
#print(g.nn("creepy", g.field("animal")))

#### COMMONSENSE DATA ##############################################################################

#--- NODEBOX.NET/PERCEPTION ------------------------------------------------------------------------


def download(path=os.path.join(MODULE, "commonsense.csv"), threshold=50):
    """ Downloads commonsense data from http://nodebox.net/perception.
        Saves the data as commonsense.csv which can be the input for Commonsense.load().
    """
    s = "http://nodebox.net/perception?format=txt&robots=1"
    s = urlopen(s).read()
    s = s.decode("utf-8")
    s = s.replace("\\'", "'")
    # Group relations by author.
    a = {}
    for r in ([v.strip("'") for v in r.split(", ")] for r in s.split("\n")):
        if len(r) == 7:
            a.setdefault(r[-2], []).append(r)
    # Iterate authors sorted by number of contributions.
    # 1) Authors with 50+ contributions can define new relations and context.
    # 2) Authors with 50- contributions (or robots) can only reinforce existing relations.
    a = sorted(a.items(), key=cmp_to_key(lambda v1, v2: len(v2[1]) - len(v1[1])))
    r = {}
    for author, relations in a:
        if author == "" or author.startswith("robots@"):
            continue
        if len(relations) < threshold:
            break
        # Sort latest-first (we prefer more recent relation types).
        relations = sorted(relations, key=cmp_to_key(lambda r1, r2: r1[-1] > r2[-1]))
        # 1) Define new relations.
        for concept1, relation, concept2, context, weight, author, date in relations:
            id = (concept1, relation, concept2)
            if id not in r:
                r[id] = [None, 0]
            if r[id][0] is None and context is not None:
                r[id][0] = context
    for author, relations in a:
        # 2) Reinforce existing relations.
        for concept1, relation, concept2, context, weight, author, date in relations:
            id = (concept1, relation, concept2)
            if id in r:
                r[id][1] += int(weight)
    # Export CSV-file.
    s = []
    for (concept1, relation, concept2), (context, weight) in r.items():
        s.append("\"%s\",\"%s\",\"%s\",\"%s\",%s" % (
            concept1, relation, concept2, context, weight))
    f = open(path, "w", encoding = 'utf-8')
    f.write(BOM_UTF8)
    f.write("\n".join(s))
    f.close()


def json():
    """ Returns a JSON-string with the data from commonsense.csv.
        Each relation is encoded as a [concept1, relation, concept2, context, weight] list.
    """
    f = lambda s: s.replace("'", "\\'").encode("utf-8")
    s = []
    g = Commonsense()
    for e in g.edges:
        s.append("\n\t['%s', '%s', '%s', '%s', %.2f]" % (
            f(e.node1.id),
            f(e.type),
            f(e.node2.id),
            f(e.context),
              e.weight
        ))
    return "commonsense = [%s];" % ", ".join(s)

#download("commonsense.csv", threshold=50)
#open("commonsense.js", "w", encoding = 'utf-8').write(json())