You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
65 lines
2.1 KiB
Python
65 lines
2.1 KiB
Python
from bs4 import BeautifulSoup
|
|
from bs4.element import Comment
|
|
import urllib.request
|
|
|
|
from reportlab.pdfbase import pdfmetrics
|
|
from reportlab.platypus import SimpleDocTemplate
|
|
from reportlab.lib.styles import getSampleStyleSheet
|
|
from reportlab.platypus import Paragraph
|
|
from reportlab.platypus import Flowable
|
|
from reportlab.pdfbase.ttfonts import TTFont
|
|
|
|
import os
|
|
import sys
|
|
import datetime
|
|
|
|
def tag_visible(element):
    """Tell whether a parsed text node counts as visible page text.

    Intended as a ``filter`` predicate. A node is rejected when its parent's
    tag name is one of the non-rendered containers listed below, or when the
    node itself is a ``Comment``; every other node is accepted.
    """
    hidden_parents = ('style', 'script', 'head', 'title', 'meta', '[document]')
    # The parent test runs first and short-circuits, so the Comment check is
    # only reached for nodes whose parent is not in the hidden list.
    is_hidden = (
        element.parent.name in hidden_parents
        or isinstance(element, Comment)
    )
    return not is_hidden
|
|
|
|
def get_script_path():
    """Return the directory containing the script named by ``sys.argv[0]``.

    ``sys.argv[0]`` is resolved with ``os.path.realpath`` first, and the
    directory part of that resolved path is returned.
    """
    resolved_script = os.path.realpath(sys.argv[0])
    script_dir = os.path.dirname(resolved_script)
    return script_dir
|
|
|
|
def text_from_html(body):
    """Extract the visible text of an HTML document as paragraph markup.

    Args:
        body: The HTML document to parse; handed straight to BeautifulSoup.

    Returns:
        One string in which every visible text node (as decided by
        ``tag_visible``) has been stripped of surrounding whitespace,
        XML-escaped, and joined to its neighbours with ``<br />``.
    """
    # Function-scope import keeps this block self-contained.
    from xml.sax.saxutils import escape

    soup = BeautifulSoup(body, 'html.parser')
    # find_all(string=True) is the current spelling of the deprecated
    # findAll(text=True); both select every text node in the tree.
    text_nodes = soup.find_all(string=True)
    visible_texts = filter(tag_visible, text_nodes)
    # The result is consumed as markup (the only intended tags are the
    # "<br />" separators), so literal '<', '>' and '&' coming from the page
    # text must be escaped or the markup parser may misread them as tags.
    return "<br />".join(escape(t.strip()) for t in visible_texts)
|
|
|
|
def text_to_pdf(content):
    """Write *content* to a dated PDF with a title line and a body paragraph.

    The output file is named ``Day-nytimes-YYYYMMDD.pdf`` from the current
    local date; the name is a relative path, so the file is created relative
    to the current working directory. Both paragraphs use the Times New Roman
    font loaded from ``font/Times_New_Roman.ttf`` next to the script.

    Args:
        content: Paragraph markup for the body text (for example the
            ``<br />``-joined string produced by ``text_from_html``).

    Returns:
        True once the document has been built.
    """
    # Derive the file name and the document title from today's date.
    date = datetime.datetime.now()
    my_doc = SimpleDocTemplate("Day-nytimes-" + date.strftime("%Y%m%d") + '.pdf')
    title = 'DAY - nytimes.com ' + date.strftime("%x")
    my_doc.title = title

    # Register the bundled font; os.path.join avoids hand-built separators.
    font_file = os.path.join(get_script_path(), "font", "Times_New_Roman.ttf")
    pdfmetrics.registerFont(TTFont('Times_New_Roman', font_file))

    # Set the styles for the title and body. The sample styles' leading is
    # chosen for their own smaller default font sizes, so it is scaled along
    # with the enlarged fonts to keep lines from crowding each other.
    sample_style_sheet = getSampleStyleSheet()

    title_style = sample_style_sheet['Heading1']
    title_style.fontName = 'Times_New_Roman'
    title_style.fontSize = 24
    title_style.leading = title_style.fontSize * 1.2

    body_style = sample_style_sheet['BodyText']
    body_style.fontName = 'Times_New_Roman'
    body_style.fontSize = 12
    body_style.leading = body_style.fontSize * 1.2

    # Collect content and write to PDF.
    flowables = [
        Paragraph(title, title_style),
        Paragraph(content, body_style),
    ]
    my_doc.build(flowables)

    return True
|
|
|
|
# Script body: download the front page, print its visible text, and save the
# same text as a PDF.
# NOTE(review): this runs at import time as in the original; consider moving
# it under an ``if __name__ == "__main__":`` guard if the module is ever
# imported rather than executed.
with urllib.request.urlopen('http://www.nytimes.com') as response:
    # The context manager closes the HTTP response once the body is read.
    html = response.read()

# Extract once and reuse, instead of parsing the page twice.
visible_text = text_from_html(html)
print(visible_text)
text_to_pdf(visible_text)