You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

157 lines
4.8 KiB
Python

# -*- coding: utf-8 -*-
from __future__ import print_function
from __future__ import unicode_literals
from builtins import str, bytes, dict, int
import os
import sys
# Prepend the directory two levels above this file to the module search path,
# so that it is searched before any other location on sys.path.
# NOTE(review): presumably this lets the example import the pattern package
# from the source tree without installing it -- confirm against repo layout.
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", ".."))
from pattern.web import DBPedia
# Single DBPedia search engine instance shared by all queries below.
dbp = DBPedia()
# DBPedia is a database of structured information mined from Wikipedia.
# DBPedia data is stored as RDF triples: (subject, predicate, object),
# e.g., X is-a Actor, Y is-a Country, Z has-birthplace Country, ...
# If you know about pattern.graph (or graphs in general),
# this triple format should look familiar.
# DBPedia can be queried using SPARQL:
# http://dbpedia.org/sparql
# http://www.w3.org/TR/rdf-sparql-query/
# A SPARQL query yields rows that match all triples in the WHERE clause.
# A SPARQL query uses ?wildcards in triple subject/object to select fields.
# 1) Search DBPedia for actors.
# Variables are indicated with a "?" prefix.
# Variables will be bound to the corresponding part of each matched triple.
# The "a" keyword is SPARQL shorthand for rdf:type, i.e., "is of the class".
# The "prefix" statement creates a shorthand for a given namespace.
# To see what semantic constraints are available in "dbo" (for example):
# http://dbpedia.org/ontology/
q = """
prefix dbo: <http://dbpedia.org/ontology/>
select ?actor where {
?actor a dbo:Actor.
}
"""
# Each selected variable (?actor) is read as an attribute of the result.
# The print statement is the loop body and must be indented under "for".
for result in dbp.search(q, start=1, count=10):
    print(result.actor)
# Blank line to separate this example's output from the next one.
print("")
# You may notice that each Result.actor is of the form:
# "http://dbpedia.org/resource/[NAME]"
# This kind of string is a subclass of unicode: DBPediaResource.
# DBPediaResource has a DBPediaResource.name property (see below).
# 2) Search DBPedia for actors and their place of birth.
# Both triples share the ?actor variable, so a row is returned only for
# resources that are an actor AND have a dbo:birthPlace.
q = """
prefix dbo: <http://dbpedia.org/ontology/>
select ?actor ?place where {
?actor a dbo:Actor.
?actor dbo:birthPlace ?place.
}
order by ?actor
"""
# ?actor and ?place are read as attributes of each result;
# their .name property is printed instead of the full resource URL.
for r in dbp.search(q, start=1, count=10):
    print("%s (%s)" % (r.actor.name, r.place.name))
# Blank line to separate this example's output from the next one.
print("")
# You will notice that the results now include duplicates:
# the same actor appears once with a city name and once with a country name.
# We could refine ?place by including the following triple:
# "?place a dbo:Country."
# 3) Search DBPedia for actors born in 1970.
# Each result must match both triples, i.e.,
# X is an actor + X is born on Y.
# We don't want to filter by month and day (e.g., "1970-12-31"),
# so we use a regular expression instead with filter():
# str(?date) converts the date to a string, and each "." matches any character.
# Note that the pattern is not anchored with "^",
# so it matches "1970-..-.." anywhere in the date string.
q = """
prefix dbo: <http://dbpedia.org/ontology/>
select ?actor ?date where {
?actor a dbo:Actor.
?actor dbo:birthDate ?date.
filter(regex(str(?date), "1970-..-.."))
}
order by ?date
"""
# The print statement is the loop body and must be indented under "for".
for r in dbp.search(q, start=1, count=10):
    print("%s (%s)" % (r.actor.name, r.date))
# Blank line to separate this example's output from the next one.
print("")
# We could also make this query shorter,
# by combining the two ?actor triples into one:
# "?actor a dbo:Actor; dbo:birthDate ?date."
# 4) A more advanced example, in German:
# The underscore-prefixed variables (?_actor, ?_place) bind the resources
# and are only used inside the WHERE clause.
# The selected variables (?actor, ?place) are bound to their rdfs:label,
# and filter() keeps only the labels tagged with language "de".
q = """
prefix dbo: <http://dbpedia.org/ontology/>
prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
select ?actor ?place where {
?_actor a dbo:Actor.
?_actor dbo:birthPlace ?_place.
?_actor rdfs:label ?actor.
?_place rdfs:label ?place.
filter(lang(?actor) = "de" && lang(?place) = "de")
}
order by ?actor
"""
# ?actor and ?place are bound to labels here, so they are printed directly.
for r in dbp.search(q, start=1, count=10):
    print("%s (%s)" % (r.actor, r.place))
# Blank line to separate this example's output from the next one.
print("")
# This extracts a German label for each matched DBPedia resource.
# - X is an actor,
# - X is born in Y,
# - X has a German label A,
# - Y has a German label B,
# - Retrieve A and B.
# For example, say one of the matched resources was:
# "<http://dbpedia.org/page/Erwin_Schrödinger>"
# If you open this URL in a browser,
# you will see all the available semantic properties and their values.
# One of the properties is "rdfs:label": a human-readable & multilingual label.
# 5) Find triples involving cats.
# <http://purl.org/dc/terms/subject>
# means: "is in the category of".
# ?cat is any resource in Category:Felis.
# "?cat ?_relation ?_concept" then matches every triple with ?cat as subject,
# and the English rdfs:label of its predicate and object is selected.
q = """
prefix dbo: <http://dbpedia.org/ontology/>
prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
select ?cat ?relation ?concept where {
?cat <http://purl.org/dc/terms/subject> <http://dbpedia.org/resource/Category:Felis>.
?cat ?_relation ?_concept.
?_relation rdfs:label ?relation.
?_concept rdfs:label ?concept.
filter(lang(?relation) = "en" && lang(?concept) = "en")
} order by ?cat
"""
# The relation label is right-padded with "-" to at least 10 characters,
# so that short relations still produce a readable arrow.
for r in dbp.search(q, start=1, count=10):
    print("%s ---%s--> %s" % (r.cat.name, r.relation.ljust(10, "-"), r.concept))
# Blank line to separate this example's output from the next one.
print("")
# 6) People whose first name includes "Édouard"
# The query contains a non-ASCII character;
# with unicode_literals imported, the query is a unicode string in Python 2 too.
q = """
prefix dbo: <http://dbpedia.org/ontology/>
prefix foaf: <http://xmlns.com/foaf/0.1/>
select ?person ?name where {
?person a dbo:Person.
?person foaf:givenName ?name.
filter(regex(?name, "Édouard"))
}
"""
# Unlike the previous examples, this search passes cached=False.
# NOTE(review): presumably this bypasses a local cache of results -- confirm.
for result in dbp.search(q, start=1, count=10, cached=False):
    print("%s (%s)" % (result.person.name, result.name))
# Blank line to end this example's output.
print("")