# bootleglibrary/scripts/add_missing_opf.py

from argparse import ArgumentParser
from pathlib import Path
import re
import sys
from hashlib import sha1
from xml.etree import ElementTree as ET


# Pick the prefixes ElementTree uses when it serializes these namespaces:
# OPF elements are written without a prefix, Dublin Core elements as "dc:".
# NOTE(review): with the OPF namespace mapped to the empty prefix, attributes
# in that namespace (e.g. opf:scheme) also appear to be written unprefixed --
# confirm that the consumers of the generated files accept this.
ET.register_namespace(prefix="", uri="http://www.idpf.org/2007/opf")
ET.register_namespace(prefix="dc", uri="http://purl.org/dc/elements/1.1/")

def rewrite_opf(from_path, to_path, calibre_id, title):
	"""Copy an OPF file, replacing its calibre id and any "Untitled" title.

	Parameters:
		from_path: OPF file to read.
		to_path: destination the rewritten XML is written to (UTF-8, with an
			XML declaration).
		calibre_id: value written (as text) into every ``dc:identifier``
			element whose ``opf:scheme`` attribute is ``calibre``.
		title: string of the form "<author>/<title>"; the part after the first
			"/", stripped of surrounding whitespace, replaces the text of every
			``dc:title`` element whose text is exactly "Untitled".

	The previous value of each rewritten identifier is printed to stdout.
	Errors from parsing or writing (e.g. a missing source file or malformed
	XML) propagate to the caller.
	"""
	et = ET.parse(from_path)
	# print (ET.tostring(et.getroot(), method="xml", encoding="unicode"))
	for tag in et.findall(".//{http://purl.org/dc/elements/1.1/}identifier[@{http://www.idpf.org/2007/opf}scheme='calibre']"):
		print ("found calibre_id", tag.text)
		tag.text = f"{calibre_id}"
	for tag in et.findall(".//{http://purl.org/dc/elements/1.1/}title"):
		if tag.text == "Untitled":
			# Derive the replacement from the untouched parameter each time;
			# reassigning `title` here would split an already-split value again
			# for a second "Untitled" element and give it a different text.
			tag.text = title.split("/", 1)[-1].strip()
	et.write(to_path, xml_declaration = True, encoding = 'utf-8', method = 'xml')

# Command-line interface: two positional arguments, the library being repaired
# and the temporary library whose OPF files are used as the source.
ap = ArgumentParser(
	prog="add_missing_opf",
	description="add missing OPF files from one Calibre library to another, preserving calibre_id's",
)
ap.add_argument("library", help="the library to fix (keep)")
ap.add_argument("tmp", help="the tmp library to use as source of OPF files")
# Parsed at import time; the rest of the script reads args.library / args.tmp.
args = ap.parse_args()

def hash_file (path, chunk_size=1 << 20):
	"""Return the SHA-1 hex digest of the file at *path*.

	The file is read in binary mode, *chunk_size* bytes at a time, so large
	files are never held in memory as a whole.

	Parameters:
		path: file to hash (anything ``open`` accepts).
		chunk_size: number of bytes per read; must be a positive integer.

	Raises:
		ValueError: if *chunk_size* is smaller than 1.
		OSError: if the file cannot be opened or read.
	"""
	if chunk_size < 1:
		# read(0) returns b"" immediately, which would silently hash nothing.
		raise ValueError("chunk_size must be a positive integer")
	ret = sha1()
	with open(path, "rb") as fin:
		# Two-argument iter() keeps calling read() until it returns b"" (EOF).
		for data in iter(lambda: fin.read(chunk_size), b""):
			ret.update(data)
	return ret.hexdigest()

def metadata_files (library_path):
	"""Yield one record per book folder found under *library_path*.

	The library is scanned two levels deep (<author>/<title folder>). Only
	title folders that are directories and whose name ends in "(<digits>)"
	are reported. Each record is a tuple:

		(title_id, title_hash, calibre_id, opf)

	title_id   -- "<author folder name>/<text before the parenthesised id>"
	              (the captured text is not stripped)
	title_hash -- hash_file() of the first *.pdf / *.epub in sorted order, or
	              None when the folder holds neither (a warning is printed)
	calibre_id -- the digits from the folder name, as an int
	opf        -- path of "metadata.opf" inside the folder; the file is not
	              checked for existence here
	"""
	folder_pattern = re.compile(r"^(.*)\((\d+)\)$")
	for author_dir in Path(library_path).glob("*"):
		if not author_dir.is_dir():
			continue
		for book_dir in author_dir.glob("*"):
			match = folder_pattern.search(book_dir.name)
			# print (f"{book_dir}, {match.groups()}")
			if match is None or not book_dir.is_dir():
				continue
			raw_title, id_digits = match.groups()
			number = int(id_digits)
			opf_path = book_dir / "metadata.opf"
			key = f"{author_dir.name}/{raw_title}"

			# PDFs and EPUBs are pooled and sorted so the file that gets
			# hashed does not depend on directory listing order.
			candidates = sorted([*book_dir.glob("*.pdf"), *book_dir.glob("*.epub")])
			if candidates:
				digest = hash_file(candidates[0])
			else:
				digest = None
				print (f"warning: no book media {book_dir}")

			yield key, digest, number, opf_path
					

# PASS 1: index the OPF files of the temporary library by the hash of each
# book's media file.
print (f"PASS 1: {args.tmp}")
metadata_by_title = {}
for title, title_hash, calibre_id, metadata in metadata_files(args.tmp):
	# A Path object is always truthy, so existence has to be tested explicitly;
	# indexing a missing file would only fail later inside rewrite_opf().
	if not metadata.exists():
		continue
	# Folders without media all share title_hash None. Indexing them would let
	# any media-less book in the target library pick up an unrelated OPF file.
	if title_hash is None:
		continue
	if title_hash in metadata_by_title:
		print (f"warning: duplicate title: {title_hash}", file=sys.stderr)
	metadata_by_title[title_hash] = metadata
print ()

# PASS 2: for every book in the library to fix that lacks metadata.opf, look
# up a source OPF by media hash and write an adapted copy next to the book as
# metadata_new.opf (existing files are never overwritten).
print (f"PASS 2: {args.library}")
replaced_count = 0
missing_count = 0
existing_count = 0

for title, title_hash, calibre_id, metadata in metadata_files(args.library):
	if metadata.exists():
		existing_count += 1
		continue
	# title_hash is None for media-less folders; None is never a key (see
	# PASS 1), so those are reported as missing.
	old_metadata = metadata_by_title.get(title_hash)
	if old_metadata is None:
		print (f"NO METADATA for {title}")
		missing_count += 1
		continue
	# print (f"have metadata for {title}")
	replaced_count += 1
	print (old_metadata)
	rewrite_opf(old_metadata, metadata.parent / "metadata_new.opf", calibre_id, title)

total = existing_count + replaced_count + missing_count
print (f"Of {total} items, {existing_count} had metadata, {replaced_count} can be replaced, {missing_count} missing.")
scripts + bin 1 year ago			`from argparse import ArgumentParser`
			`from pathlib import Path`
			`import re`
			`import sys`
			`from hashlib import sha1`
			`from xml.etree import ElementTree as ET`


			`ET.register_namespace('',"http://www.idpf.org/2007/opf")`
			`ET.register_namespace('dc','http://purl.org/dc/elements/1.1/')`

			`def rewrite_opf(from_path, to_path, calibre_id, title):`
			`et = ET.parse(from_path)`
			`# print (ET.tostring(et.getroot(), method="xml", encoding="unicode"))`
			`for tag in et.findall(".//{http://purl.org/dc/elements/1.1/}identifier[@{http://www.idpf.org/2007/opf}scheme='calibre']"):`
			`print ("found calibre_id", tag.text)`
			`tag.text = f"{calibre_id}"`
			`for tag in et.findall(".//{http://purl.org/dc/elements/1.1/}title"):`
			`if tag.text == "Untitled":`
			`title = title.split("/", 1)[-1]`
			`title = title.strip()`
			`tag.text = title`
			`et.write(to_path, xml_declaration = True, encoding = 'utf-8', method = 'xml')`

			`ap= ArgumentParser("add_missing_opf", description="add missing OPF files from one Calibre library to another, preserving calibre_id's")`
			`ap.add_argument("library", help="the library to fix (keep)")`
			`ap.add_argument("tmp", help="the tmp library to use as source of OPF files")`
			`args = ap.parse_args()`

			`def hash_file (path):`
			`ret = sha1()`
			`with open(path, "rb") as fin:`
			`while True:`
			`data = fin.read()`
			`if not data:`
			`break`
			`ret.update(data)`
			`return ret.hexdigest()`

			`def metadata_files (library_path):`
			`for author_folder in Path(library_path).glob("*"):`
			`if author_folder.is_dir():`
			`for title_folder in author_folder.glob("*"):`
			`m = re.search(r"^(.*)\((\d+)\)$", title_folder.name)`
			`# print (f"{title_folder}, {m.groups()}")`
			`if title_folder.is_dir() and m is not None:`
			`title = m.group(1)`
			`calibre_id = int(m.group(2))`
			`opf = title_folder / "metadata.opf"`
			`# opf = list(title_folder.glob("metadata.opf"))`
			`title_id = f"{author_folder.name}/{title}"`

			`book_media = list(title_folder.glob(".pdf")) + list(title_folder.glob(".epub"))`
			`book_media.sort()`
			`if len(book_media):`
			`title_hash = hash_file(book_media[0])`
			`else:`
			`title_hash = None`
			`print (f"warning: no book media {title_folder}")`

			`yield title_id, title_hash, calibre_id, opf`


			`print (f"PASS 1: {args.tmp}")`
			`metadata_by_title = {}`
			`for title, title_hash, calibre_id, metadata in metadata_files(args.tmp):`
			`if metadata:`
			`if title_hash in metadata_by_title:`
			`print (f"warning: duplicate title: {title_hash}", file=sys.stderr)`
			`metadata_by_title[title_hash] = metadata`
			`print ()`
			`print (f"PASS 2: {args.library}")`
			`replaced_count = 0`
			`missing_count = 0`
			`existing_count = 0`

			`for title, title_hash, calibre_id, metadata in metadata_files(args.library):`
			`if not metadata.exists():`
			`if title_hash not in metadata_by_title:`
			`print (f"NO METADATA for {title}")`
			`missing_count += 1`
			`continue`
			`else:`
			`old_metadata = metadata_by_title[title_hash]`
			`# print (f"have metadata for {title}")`
			`replaced_count += 1`
			`print (old_metadata)`
			`rewrite_opf(old_metadata, metadata.parent / "metadata_new.opf", calibre_id, title)`
			`else:`
			`existing_count += 1`

			`total = existing_count + replaced_count + missing_count`
			`print (f"Of {total} items, {existing_count} had metadata, {replaced_count} can be replaced, {missing_count} missing.")`