You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

98 lines
3.2 KiB
Python

1 year ago
from argparse import ArgumentParser
from pathlib import Path
import re
import sys
from hashlib import sha1
from xml.etree import ElementTree as ET
# Prefixes ElementTree should use when serialising the two namespaces found in
# OPF files: the OPF namespace is mapped to the empty prefix, Dublin Core to "dc".
# NOTE(review): with the OPF namespace mapped to the empty prefix, ElementTree
# appears to also write opf:-qualified attribute names without a prefix --
# confirm that consumers of the generated files accept that.
for _prefix, _uri in (
    ('', "http://www.idpf.org/2007/opf"),
    ('dc', 'http://purl.org/dc/elements/1.1/'),
):
    ET.register_namespace(_prefix, _uri)
def rewrite_opf(from_path, to_path, calibre_id, title):
    """Copy an OPF file, replacing its calibre id and any placeholder title.

    Args:
        from_path: OPF file to read.
        to_path: Destination for the modified document; written as UTF-8 XML
            with an XML declaration.
        calibre_id: Value written (as text) into every ``dc:identifier``
            element whose ``opf:scheme`` attribute equals ``calibre``.
        title: Replacement text for every ``dc:title`` element whose text is
            exactly ``"Untitled"``. Everything up to and including the first
            ``/`` is dropped and surrounding whitespace is stripped before
            the text is written.

    The previous text of each matching identifier is printed to stdout.
    """
    tree = ET.parse(from_path)

    identifier_query = ".//{http://purl.org/dc/elements/1.1/}identifier[@{http://www.idpf.org/2007/opf}scheme='calibre']"
    for node in tree.findall(identifier_query):
        print ("found calibre_id", node.text)
        node.text = f"{calibre_id}"

    # The title is cleaned at most once. Re-splitting the already cleaned
    # value for every matching element would chop a title that itself
    # contains "/" a little further each time, leaving the elements with
    # different texts. It is computed lazily so `title` is only touched when
    # a placeholder is actually present.
    clean_title = None
    for node in tree.findall(".//{http://purl.org/dc/elements/1.1/}title"):
        if node.text != "Untitled":
            continue
        if clean_title is None:
            clean_title = title.split("/", 1)[-1].strip()
        node.text = clean_title

    tree.write(to_path, xml_declaration=True, encoding='utf-8', method='xml')
# Command-line interface: two positional arguments, the library to repair and
# the temporary library whose OPF files are used as the source.
ap = ArgumentParser(
    prog="add_missing_opf",
    description="add missing OPF files from one Calibre library to another, preserving calibre_id's",
)
ap.add_argument("library", help="the library to fix (keep)")
ap.add_argument("tmp", help="the tmp library to use as source of OPF files")
args = ap.parse_args()
def hash_file (path, chunk_size=1 << 20):
    """Return the hexadecimal SHA-1 digest of the file at *path*.

    The file is read in blocks of *chunk_size* bytes (1 MiB by default) so
    that large book files are never held in memory as a whole.

    Args:
        path: File to hash; opened in binary mode.
        chunk_size: Number of bytes requested per read.

    Returns:
        The digest as a 40-character lowercase hex string.
    """
    digest = sha1()
    with open(path, "rb") as fin:
        # iter(callable, sentinel) keeps calling read() until it returns b"",
        # which is what a binary file yields at end of file.
        for chunk in iter(lambda: fin.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()
def metadata_files (library_path):
    """Walk a two-level ``<author>/<title> (<id>)`` tree and describe each book.

    Only directories directly under *library_path* are visited, and inside
    them only directories whose name ends in a parenthesised number.

    Yields:
        Tuples ``(title_id, title_hash, calibre_id, opf)`` where

        * ``title_id`` is ``"<author folder name>/<folder name before '('>"``,
        * ``title_hash`` is ``hash_file`` of the first ``*.pdf``/``*.epub``
          file in sorted order, or ``None`` (after printing a warning) when
          the folder holds neither,
        * ``calibre_id`` is the parenthesised number as an ``int``,
        * ``opf`` is the path ``<book folder>/metadata.opf``; the file is not
          checked for existence here.
    """
    folder_pattern = re.compile(r"^(.*)\((\d+)\)$")

    for author_dir in Path(library_path).glob("*"):
        if not author_dir.is_dir():
            continue

        for book_dir in author_dir.glob("*"):
            match = folder_pattern.search(book_dir.name)
            if not book_dir.is_dir() or match is None:
                continue

            name_part, id_part = match.group(1), match.group(2)
            candidates = sorted(
                [*book_dir.glob("*.pdf"), *book_dir.glob("*.epub")]
            )

            if candidates:
                digest = hash_file(candidates[0])
            else:
                digest = None
                print (f"warning: no book media {book_dir}")

            yield (
                f"{author_dir.name}/{name_part}",
                digest,
                int(id_part),
                book_dir / "metadata.opf",
            )
# Pass 1: index the OPF files of the temporary library by the SHA-1 of the
# book file stored next to them.
print (f"PASS 1: {args.tmp}")
metadata_by_title = {}
for title, title_hash, calibre_id, metadata in metadata_files(args.tmp):
    # `metadata` is a Path, which is always truthy, so the file itself has to
    # be tested: indexing an OPF that does not exist would make the later
    # rewrite fail when it tries to parse it.
    if not metadata.exists():
        continue
    # A hash of None means the folder had no book media. Such entries cannot
    # be told apart from one another, so they must not share a dictionary key.
    if title_hash is None:
        continue
    if title_hash in metadata_by_title:
        print (f"warning: duplicate title: {title_hash}", file=sys.stderr)
    metadata_by_title[title_hash] = metadata
print ()

# Pass 2: for every book in the target library that lacks metadata.opf, look
# up an OPF by content hash and write an adjusted copy as metadata_new.opf.
print (f"PASS 2: {args.library}")
replaced_count = 0
missing_count = 0
existing_count = 0
for title, title_hash, calibre_id, metadata in metadata_files(args.library):
    if metadata.exists():
        existing_count += 1
        continue

    # None is never a key (see pass 1), so books without media fall through
    # to the "missing" branch instead of receiving an unrelated OPF.
    old_metadata = metadata_by_title.get(title_hash)
    if old_metadata is None:
        print (f"NO METADATA for {title}")
        missing_count += 1
        continue

    replaced_count += 1
    print (old_metadata)
    rewrite_opf(old_metadata, metadata.parent / "metadata_new.opf", calibre_id, title)

total = existing_count + replaced_count + missing_count
print (f"Of {total} items, {existing_count} had metadata, {replaced_count} can be replaced, {missing_count} missing.")