You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
41 lines
1.3 KiB
Python
41 lines
1.3 KiB
Python
from __future__ import print_function
|
|
from __future__ import unicode_literals
|
|
|
|
from builtins import str, bytes, dict, int
|
|
|
|
import os
|
|
import sys
|
|
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", ".."))
|
|
|
|
from pattern.web import Newsfeed, plaintext, URL
|
|
from pattern.db import date
|
|
|
|
# This example reads a given RSS or Atom newsfeed channel.
# Some example feeds to try out (each constant is the URL of one feed):
NATURE = "http://feeds.nature.com/nature/rss/current"
SCIENCE = "http://www.sciencemag.org/rss/podcast.xml"
NYT = "http://rss.nytimes.com/services/xml/rss/nyt/GlobalHome.xml"
TIME = "http://feeds.feedburner.com/time/topstories"
CNN = "http://rss.cnn.com/rss/edition.rss"
|
|
|
|
# Create the feed reader (Newsfeed is imported from pattern.web above).
engine = Newsfeed()

# Query the CNN feed and print every returned item.
# cached=True is forwarded to search().
# NOTE(review): presumably this reuses a locally cached response instead of
# downloading the feed again -- confirm against the pattern.web documentation.
for result in engine.search(CNN, cached=True):
    print(result.title.upper())
    print(plaintext(result.text))  # Remove HTML formatting.
    print(result.url)
    print(result.date)
    print("")  # Blank line separates consecutive items.
|
|
|
|
# News item URLs lead to the page with the full article.
|
|
# This page can have any kind of formatting.
|
|
# There is no default way to read it.
|
|
# But we could just download the source HTML and convert it to plain text:
|
|
|
|
#html = URL(result.url).download()
|
|
#print(plaintext(html))
|
|
|
|
# The resulting text may contain a lot of garbage.
|
|
# A better way is to use a DOM parser to select the HTML elements we want.
|
|
# This is demonstrated in one of the next examples.
|