You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

312 lines
12 KiB
Python

#### PATTERN | COMMONSENSE #########################################################################
# Copyright (c) 2010 University of Antwerp, Belgium
# Author: Tom De Smedt <tom@organisms.be>
# License: BSD (see LICENSE.txt for details).
# http://www.clips.ua.ac.be/pages/pattern
####################################################################################################
from __future__ import print_function
from __future__ import absolute_import
from __future__ import unicode_literals
from __future__ import division
from builtins import str, bytes, dict, int
from builtins import map, zip, filter
from builtins import object, range
from codecs import BOM_UTF8
from itertools import chain
from functools import cmp_to_key
from io import open
try:
# Python 2
from urllib import urlopen
except ImportError:
# Python 3
from urllib.request import urlopen
from .__init__ import Graph, Node, Edge, bfs
from .__init__ import WEIGHT, CENTRALITY, EIGENVECTOR, BETWEENNESS
import os
import sys
# Directory that contains this module; used to build the default path to commonsense.csv.
try:
    MODULE = os.path.dirname(os.path.realpath(__file__))
except Exception:
    # __file__ is not always defined; fall back to a path relative to the working directory.
    # (Exception instead of a bare except, so KeyboardInterrupt / SystemExit still propagate.)
    MODULE = ""

# codecs.BOM_UTF8 is a byte string; the rest of this module works with text,
# so keep the decoded marker. The decode is the same on Python 2 and 3,
# no interpreter version check is needed.
BOM_UTF8 = BOM_UTF8.decode("utf-8")
#### COMMONSENSE SEMANTIC NETWORK ##################################################################
#--- CONCEPT ---------------------------------------------------------------------------------------
class Concept(Node):

    def __init__(self, *args, **kwargs):
        """ A concept in the semantic network.
            All arguments are passed on to Node.
        """
        Node.__init__(self, *args, **kwargs)
        # Cache for Concept.properties, filled on first access.
        self._properties = None

    @property
    def halo(self, depth=2):
        """ Returns the concept halo: a list with this concept + surrounding concepts.
            This is useful to reason more fluidly about the concept,
            since the halo will include latent properties linked to nearby concepts.
            Accessed as a property, so depth always has its default value (2).
        """
        return self.flatten(depth=depth)

    @property
    def properties(self):
        """ Returns the top properties in the concept halo, sorted by centrality (highest first).
            The return value is a list of concept id's instead of Concepts (for performance).
            The list is computed once and then cached on the concept.
        """
        if self._properties is not None:
            return self._properties
        # Subgraph restricted to the halo.
        subgraph = self.graph.copy(nodes=self.halo)
        # Keep the nodes whose id is a known property in the full graph.
        ranked = [n for n in subgraph.nodes if n.id in self.graph.properties]
        # Ascending sort followed by a reversal (i.e., highest centrality first).
        ranked.sort(key=lambda n: n.centrality)
        ranked.reverse()
        self._properties = [n.id for n in ranked]
        return self._properties
def halo(concept, depth=2):
    """ Returns the halo of the given concept for the given depth,
        i.e., the result of concept.flatten(depth).
    """
    surrounding = concept.flatten(depth=depth)
    return surrounding
def properties(concept, depth=2, centrality=BETWEENNESS):
    """ Returns the id's of the properties in the halo of the given concept (with the given depth),
        sorted by the given centrality attribute (highest first).
    """
    # Subgraph restricted to the concept halo.
    subgraph = concept.graph.copy(nodes=halo(concept, depth))
    # Keep the nodes whose id is a known property in the full graph.
    ranked = [n for n in subgraph.nodes if n.id in concept.graph.properties]
    # Ascending sort followed by a reversal (i.e., highest centrality first).
    ranked.sort(key=lambda n: getattr(n, centrality))
    ranked.reverse()
    return [n.id for n in ranked]
#--- RELATION --------------------------------------------------------------------------------------
class Relation(Edge):

    def __init__(self, *args, **kwargs):
        """ A relation between two concepts, with an optional context.
            For example, "Felix is-a cat" is in the "media" context, "tiger is-a cat" in "nature".
            All arguments except the optional "context" keyword are passed on to Edge.
        """
        # Take "context" out of the keyword arguments before they reach Edge.
        context = kwargs.pop("context") if "context" in kwargs else None
        self.context = context
        Edge.__init__(self, *args, **kwargs)
#--- HEURISTICS ------------------------------------------------------------------------------------
# Similarity between concepts is measured using a featural approach:
# a comparison of the features/properties that are salient in each concept's halo.
# Commonsense.similarity() takes an optional "heuristic" parameter to tweak this behavior.
# It is a tuple of two functions:
# 1) function(concept) returns a list of salient properties (or other),
# 2) function(edge) returns the cost to traverse this edge (0.0-1.0).
COMMONALITY = (
    # 1) Salient properties of a concept: its Concept.properties list.
    lambda concept: concept.properties,
    # 2) Edge cost: 0 for relations in the "properties" context (except is-opposite-of),
    #    1 for every other relation.
    lambda edge: int(edge.context != "properties" or edge.type == "is-opposite-of"),
)
#--- COMMONSENSE -----------------------------------------------------------------------------------
class Commonsense(Graph):

    def __init__(self, data=os.path.join(MODULE, "commonsense.csv"), **kwargs):
        """ A semantic network of commonsense, using different relation types:
            - is-a,
            - is-part-of,
            - is-opposite-of,
            - is-property-of,
            - is-related-to,
            - is-same-as,
            - is-effect-of.
            The given data is the path of a UTF-8 encoded CSV-file with
            (concept1, relation, concept2, context, weight)-rows,
            or None to create an empty network.
            Other keyword arguments are passed on to Graph.
        """
        Graph.__init__(self, **kwargs)
        # Cache for Commonsense.properties, reset whenever the graph changes.
        self._properties = None
        # Load data from the given path,
        # a CSV-file of (concept1, relation, concept2, context, weight)-items.
        if data is not None:
            # Read inside a context manager so the file handle is always closed.
            with open(data, encoding="utf-8") as f:
                s = f.read()
            s = s.strip(BOM_UTF8)
            for row in s.splitlines():
                # Fields are split on commas and stripped of surrounding double quotes.
                # A row that does not have exactly 5 fields raises a ValueError.
                concept1, relation, concept2, context, weight = (
                    v.strip("\"") for v in row.split(","))
                # The integer weight is scaled by 0.1 and capped at 1.0.
                self.add_edge(concept1, concept2,
                    type=relation,
                    context=context,
                    weight=min(int(weight) * 0.1, 1.0))

    @property
    def concepts(self):
        """ Returns the nodes in the network (alias for Graph.nodes).
        """
        return self.nodes

    @property
    def relations(self):
        """ Returns the edges in the network (alias for Graph.edges).
        """
        return self.edges

    @property
    def properties(self):
        """ Returns the set of id's of all concepts that are properties (i.e., adjectives):
            the concepts at either end of a relation in the "properties" context.
            The set is cached until a node or edge is added or removed.
        """
        if self._properties is None:
            #self._properties = set(e.node1.id for e in self.edges if e.type == "is-property-of")
            edges = (e for e in self.edges if e.context == "properties")
            self._properties = set(chain.from_iterable((e.node1.id, e.node2.id) for e in edges))
        return self._properties

    def add_node(self, id, *args, **kwargs):
        """ Returns a Concept (Node subclass).
        """
        self._properties = None
        kwargs.setdefault("base", Concept)
        return Graph.add_node(self, id, *args, **kwargs)

    def add_edge(self, id1, id2, *args, **kwargs):
        """ Returns a Relation between two concepts (Edge subclass).
        """
        self._properties = None
        kwargs.setdefault("base", Relation)
        return Graph.add_edge(self, id1, id2, *args, **kwargs)

    def remove(self, x):
        """ Removes the given node or edge (see Graph.remove) and resets the properties cache.
        """
        self._properties = None
        Graph.remove(self, x)

    def similarity(self, concept1, concept2, k=3, heuristic=COMMONALITY):
        """ Returns the similarity of the given concepts,
            by cross-comparing shortest path distance between k concept properties.
            A given concept can also be a flat list of properties, e.g. ["creepy"].
            The given heuristic is a tuple of two functions:
            1) function(concept) returns a list of salient properties,
            2) function(edge) returns the cost for traversing this edge (0.0-1.0).
        """
        # Concept id => Concept.
        if isinstance(concept1, str):
            concept1 = self[concept1]
        if isinstance(concept2, str):
            concept2 = self[concept2]
        # Concept => list of salient properties.
        if isinstance(concept1, Node):
            concept1 = heuristic[0](concept1)
        if isinstance(concept2, Node):
            concept2 = heuristic[0](concept2)
        # List of id's and/or nodes => list of nodes.
        # A conditional expression (instead of "x and y or z") keeps a node
        # even if it would evaluate as False.
        if isinstance(concept1, list):
            concept1 = [n if isinstance(n, Node) else self[n] for n in concept1]
        if isinstance(concept2, list):
            concept2 = [n if isinstance(n, Node) else self[n] for n in concept2]
        h = lambda id1, id2: heuristic[1](self.edge(id1, id2))
        w = 0.0
        for p1 in concept1[:k]:
            for p2 in concept2[:k]:
                p = self.shortest_path(p1, p2, heuristic=h)
                # No path contributes next to nothing; shorter paths contribute more.
                w += 1.0 / (1e10 if p is None else len(p))
        return w / k

    def nearest_neighbors(self, concept, concepts=(), k=3):
        """ Returns the given concepts as a list sorted by similarity to the given concept
            (most similar first), where k is passed on to Commonsense.similarity().
        """
        return sorted(concepts, key=lambda candidate: self.similarity(concept, candidate, k), reverse=True)

    similar = neighbors = nn = nearest_neighbors

    def taxonomy(self, concept, depth=3, fringe=2):
        """ Returns a list of concepts that are descendants of the given concept, using "is-a" relations.
            Creates a subgraph of "is-a" related concepts up to the given depth,
            then takes the fringe (i.e., leaves) of the subgraph.
        """
        def traversable(node, edge):
            # Follow parent-child edges.
            return edge.node2 == node and edge.type == "is-a"
        if not isinstance(concept, Node):
            concept = self[concept]
        g = self.copy(nodes=concept.flatten(depth, traversable))
        g = g.fringe(depth=fringe)
        # Map the subgraph nodes back to the concepts in this network.
        g = [self[n.id] for n in g if n != concept]
        return g

    field = semantic_field = taxonomy
#g = Commonsense()
#print(g.nn("party", g.field("animal")))
#print(g.nn("creepy", g.field("animal")))
#### COMMONSENSE DATA ##############################################################################
#--- NODEBOX.NET/PERCEPTION ------------------------------------------------------------------------
def download(path=os.path.join(MODULE, "commonsense.csv"), threshold=50):
    """ Downloads commonsense data from http://nodebox.net/perception.
        Saves the data as a CSV-file at the given path (commonsense.csv by default),
        with (concept1, relation, concept2, context, weight)-rows
        that can be used as the data for Commonsense().
        Authors with less contributions than the given threshold
        can only reinforce the weight of existing relations.
    """
    url = "http://nodebox.net/perception?format=txt&robots=1"
    response = urlopen(url)
    try:
        s = response.read()
    finally:
        # Release the connection, even if reading fails.
        response.close()
    s = s.decode("utf-8")
    s = s.replace("\\'", "'")
    # Group relations by author.
    # Lines that do not have exactly 7 fields
    # (concept1, relation, concept2, context, weight, author, date) are ignored.
    authors = {}
    for line in s.split("\n"):
        fields = [v.strip("'") for v in line.split(", ")]
        if len(fields) == 7:
            authors.setdefault(fields[-2], []).append(fields)
    # Iterate authors sorted by number of contributions (most first).
    # 1) Authors with 50+ contributions can define new relations and context.
    # 2) Authors with 50- contributions (or robots) can only reinforce existing relations.
    authors = sorted(authors.items(), key=lambda item: len(item[1]), reverse=True)
    # {(concept1, relation, concept2): [context, weight]}
    r = {}
    for author, relations in authors:
        if author == "" or author.startswith("robots@"):
            continue
        if len(relations) < threshold:
            # Sorted by contributions, so all remaining authors are below the threshold too.
            break
        # Sort latest-first (we prefer more recent relation types).
        # A comparison function that returns True/False never reports "less than",
        # which leaves the list unsorted, so sort on the date field directly.
        # NOTE(review): assumes date strings sort chronologically -- confirm the format.
        relations = sorted(relations, key=lambda fields: fields[-1], reverse=True)
        # 1) Define new relations.
        for concept1, relation, concept2, context, _weight, _author, _date in relations:
            key = (concept1, relation, concept2)
            if key not in r:
                r[key] = [None, 0]
            if r[key][0] is None and context is not None:
                r[key][0] = context
    for author, relations in authors:
        # 2) Reinforce existing relations (all authors, including robots).
        for concept1, relation, concept2, _context, weight, _author, _date in relations:
            key = (concept1, relation, concept2)
            if key in r:
                r[key][1] += int(weight)
    # Export CSV-file.
    rows = []
    for (concept1, relation, concept2), (context, weight) in r.items():
        rows.append("\"%s\",\"%s\",\"%s\",\"%s\",%s" % (
            concept1, relation, concept2, context, weight))
    # The context manager closes the file, even if writing fails.
    with open(path, "w", encoding="utf-8") as f:
        f.write(BOM_UTF8)
        f.write("\n".join(rows))
def json():
    """ Returns a JSON-string with the data from commonsense.csv.
        Each relation is encoded as a [concept1, relation, concept2, context, weight] list.
        The string has the form "commonsense = [...];",
        with values in single quotes and the weight with two decimals.
    """
    def escape(s):
        # Escape single quotes, since values are wrapped in single quotes.
        # The value stays a text string: formatting an encoded byte string with %s
        # would insert its b'...' representation on Python 3.
        return s.replace("'", "\\'")
    s = []
    g = Commonsense()
    for e in g.edges:
        s.append("\n\t['%s', '%s', '%s', '%s', %.2f]" % (
            escape(e.node1.id),
            escape(e.type),
            escape(e.node2.id),
            escape(e.context),
            e.weight
        ))
    return "commonsense = [%s];" % ", ".join(s)
#download("commonsense.csv", threshold=50)
#open("commonsense.js", "w", encoding = 'utf-8').write(json())