You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

65 lines
2.1 KiB
Python

from bs4 import BeautifulSoup
from bs4.element import Comment
import urllib.request
from reportlab.pdfbase import pdfmetrics
from reportlab.platypus import SimpleDocTemplate
from reportlab.lib.styles import getSampleStyleSheet
from reportlab.platypus import Paragraph
from reportlab.platypus import Flowable
from reportlab.pdfbase.ttfonts import TTFont
import os
import sys
import datetime
def tag_visible(element):
    """Decide whether a parsed text node counts as visible page text.

    A node is rejected when its parent tag is one of the non-rendered
    containers listed below, or when the node itself is an HTML comment.

    Args:
        element: a text node exposing ``parent.name``.

    Returns:
        bool: ``True`` if the node should be kept, ``False`` otherwise.
    """
    non_rendered_parents = ('style', 'script', 'head', 'title', 'meta', '[document]')
    # The parent check comes first so the comment test only runs for
    # nodes that sit under a rendered tag.
    if element.parent.name in non_rendered_parents:
        return False
    return not isinstance(element, Comment)
def get_script_path():
    """Return the directory that contains the running script.

    The script location is taken from ``sys.argv[0]`` and passed through
    ``os.path.realpath`` before its directory part is extracted.

    Returns:
        str: directory portion of the resolved script path.
    """
    script_file = os.path.realpath(sys.argv[0])
    script_dir, _script_name = os.path.split(script_file)
    return script_dir
def text_from_html(body):
    """Extract the visible text from an HTML document.

    The markup is parsed with the ``html.parser`` backend, every text node
    is collected, nodes rejected by ``tag_visible`` are dropped, and the
    remaining nodes are whitespace-stripped and joined with ``<br />``.
    Nodes that consist only of whitespace are not removed; they contribute
    an empty entry between two separators.

    Args:
        body: HTML markup (already fetched) to hand to ``BeautifulSoup``.

    Returns:
        str: the stripped visible text nodes joined by ``"<br />"``.
    """
    soup = BeautifulSoup(body, 'html.parser')
    # find_all(string=True) is the current spelling of the deprecated
    # findAll(text=True); it returns the same list of text nodes.
    text_nodes = soup.find_all(string=True)
    return u"<br />".join(
        node.strip() for node in text_nodes if tag_visible(node)
    )
def text_to_pdf(content):
    """Write ``content`` to a dated PDF with a title heading.

    The output file is named ``Day-nytimes-YYYYMMDD.pdf`` (a relative
    path) using the current local date. The document consists of one
    ``Heading1`` paragraph holding the title and one ``BodyText``
    paragraph holding ``content``, both set in the Times New Roman font
    loaded from ``font/Times_New_Roman.ttf`` next to the script.

    Args:
        content: text for the body paragraph; it is passed unchanged to
            ``Paragraph``.

    Returns:
        bool: always ``True`` once the document has been built.
    """
    now = datetime.datetime.now()
    document = SimpleDocTemplate("Day-nytimes-" + now.strftime("%Y%m%d") + '.pdf')
    heading = 'DAY - nytimes.com ' + now.strftime("%x")
    document.title = heading

    # Register the bundled TrueType font under the name the styles refer to.
    styles = getSampleStyleSheet()
    font_name = 'Times_New_Roman'
    font_file = get_script_path() + "/font/Times_New_Roman.ttf"
    pdfmetrics.registerFont(TTFont(font_name, font_file))

    # Apply the font and a point size to the two styles used below.
    for style_key, point_size in (('Heading1', 24), ('BodyText', 12)):
        style = styles[style_key]
        style.fontName = font_name
        style.fontSize = point_size

    # Title first, then the body, in a single build pass.
    story = [
        Paragraph(heading, styles['Heading1']),
        Paragraph(content, styles['BodyText']),
    ]
    document.build(story)
    return True
# Download the front page. The context manager closes the HTTP response as
# soon as the body has been read, instead of leaving the connection open.
with urllib.request.urlopen('http://www.nytimes.com') as response:
    html = response.read()

# Extract the visible text once and reuse it for both the console output and
# the PDF, rather than parsing the same markup twice.
visible_text = text_from_html(html)
print(visible_text)
text_to_pdf(visible_text)