"""Add missing OPF files from one Calibre library to another.

Books are matched between the two libraries by the SHA-1 of their first
pdf/epub file.  For every book in the library being fixed that lacks a
``metadata.opf``, the matching OPF from the tmp library is rewritten with
the target book's calibre id and saved next to the book as
``metadata_new.opf``.
"""
from argparse import ArgumentParser
from hashlib import sha1
from pathlib import Path
import re
import sys
from xml.etree import ElementTree as ET

ET.register_namespace('', "http://www.idpf.org/2007/opf")
ET.register_namespace('dc', 'http://purl.org/dc/elements/1.1/')

# XPath selecting <dc:identifier opf:scheme="calibre"> elements.
_CALIBRE_ID_XPATH = (
    ".//{http://purl.org/dc/elements/1.1/}identifier"
    "[@{http://www.idpf.org/2007/opf}scheme='calibre']"
)
# XPath selecting <dc:title> elements.
_TITLE_XPATH = ".//{http://purl.org/dc/elements/1.1/}title"

# Book folders are named "<title>(<calibre id>)"; group 1 keeps any
# whitespace that precedes the opening parenthesis.
_TITLE_FOLDER_RE = re.compile(r"^(.*)\((\d+)\)$")


def rewrite_opf(from_path, to_path, calibre_id, title):
    """Copy the OPF at *from_path* to *to_path* with a new calibre id.

    Every ``dc:identifier`` whose ``opf:scheme`` is ``calibre`` gets
    *calibre_id* as its text.  Every ``dc:title`` whose text is exactly
    ``"Untitled"`` is replaced by *title* with anything up to the first
    ``/`` removed and surrounding whitespace stripped.

    Args:
        from_path: Path of the OPF file to read.
        to_path: Path the rewritten OPF is written to.
        calibre_id: Id to store in the calibre identifier element(s).
        title: Title id of the form ``"author/title"``.
    """
    et = ET.parse(from_path)
    for tag in et.findall(_CALIBRE_ID_XPATH):
        print("found calibre_id", tag.text)
        tag.text = str(calibre_id)

    # Only the part after the author prefix is used as the book title.
    clean_title = title.split("/", 1)[-1].strip()
    for tag in et.findall(_TITLE_XPATH):
        if tag.text == "Untitled":
            tag.text = clean_title

    # NOTE(review): with the OPF namespace registered as the default
    # prefix, ElementTree may serialize opf:-qualified attributes without
    # a prefix -- confirm Calibre still reads the rewritten file.
    et.write(to_path, xml_declaration=True, encoding='utf-8', method='xml')


def hash_file(path, chunk_size=1 << 20):
    """Return the hex SHA-1 digest of the file at *path*.

    The file is read in blocks of *chunk_size* bytes so that large books
    are never held in memory as a whole.

    Raises:
        ValueError: If *chunk_size* is not a positive number.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be positive")
    digest = sha1()
    with open(path, "rb") as fin:
        for chunk in iter(lambda: fin.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()


def metadata_files(library_path):
    """Yield one record per book folder found under *library_path*.

    The library is scanned two levels deep (``<author>/<title folder>``);
    only title folders whose name ends in ``(<digits>)`` are reported.

    Yields:
        Tuples ``(title_id, title_hash, calibre_id, opf)`` where
        ``title_id`` is ``"<author folder>/<folder name before the id>"``,
        ``title_hash`` is the SHA-1 of the first pdf/epub in sorted order
        (``None`` when the folder has neither), ``calibre_id`` is the
        integer taken from the folder name and ``opf`` is the path of the
        folder's ``metadata.opf``, which may not exist.
    """
    for author_folder in Path(library_path).glob("*"):
        if not author_folder.is_dir():
            continue
        for title_folder in author_folder.glob("*"):
            m = _TITLE_FOLDER_RE.search(title_folder.name)
            if m is None or not title_folder.is_dir():
                continue
            title = m.group(1)
            calibre_id = int(m.group(2))
            opf = title_folder / "metadata.opf"
            title_id = f"{author_folder.name}/{title}"
            book_media = sorted(
                [*title_folder.glob("*.pdf"), *title_folder.glob("*.epub")]
            )
            if book_media:
                title_hash = hash_file(book_media[0])
            else:
                title_hash = None
                print(f"warning: no book media \n{title_folder}")
            yield title_id, title_hash, calibre_id, opf


def main(argv=None):
    """Run both passes and print a summary.

    Args:
        argv: Command-line arguments (``library`` then ``tmp``); defaults
            to ``sys.argv[1:]`` when ``None``.

    Returns:
        The tuple ``(existing_count, replaced_count, missing_count)`` for
        the library being fixed.
    """
    ap = ArgumentParser(
        "add_missing_opf",
        description="add missing OPF files from one Calibre library to another, preserving calibre_id's",
    )
    ap.add_argument("library", help="the library to fix (keep)")
    ap.add_argument("tmp", help="the tmp library to use as source of OPF files")
    args = ap.parse_args(argv)

    print(f"PASS 1: {args.tmp}")
    metadata_by_title = {}
    for _title, title_hash, _calibre_id, metadata in metadata_files(args.tmp):
        # A book without media has no hash to match on, and an OPF path
        # that does not exist cannot serve as a source: skip both.
        if title_hash is None or not metadata.exists():
            continue
        if title_hash in metadata_by_title:
            print(f"warning: duplicate title: {title_hash}", file=sys.stderr)
        metadata_by_title[title_hash] = metadata
    print()

    print(f"PASS 2: {args.library}")
    replaced_count = 0
    missing_count = 0
    existing_count = 0
    for title, title_hash, calibre_id, metadata in metadata_files(args.library):
        if metadata.exists():
            existing_count += 1
            continue
        # None is never a key in metadata_by_title, so media-less books
        # are always reported as missing rather than mismatched.
        old_metadata = metadata_by_title.get(title_hash)
        if old_metadata is None:
            print(f"NO METADATA for {title}")
            missing_count += 1
            continue
        replaced_count += 1
        print(old_metadata)
        rewrite_opf(old_metadata, metadata.parent / "metadata_new.opf", calibre_id, title)

    total = existing_count + replaced_count + missing_count
    print(f"Of {total} items, {existing_count} had metadata, {replaced_count} can be replaced, {missing_count} missing.")
    return existing_count, replaced_count, missing_count


if __name__ == "__main__":
    main()