You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

98 lines
3.2 KiB
Python

from argparse import ArgumentParser
from pathlib import Path
import re
import sys
from hashlib import sha1
from xml.etree import ElementTree as ET
# Fix the prefixes ElementTree uses when it serialises OPF documents: Dublin
# Core elements get the "dc:" prefix and the OPF namespace is written as the
# default (unprefixed) namespace instead of an auto-generated "ns0:".
ET.register_namespace("dc", "http://purl.org/dc/elements/1.1/")
ET.register_namespace("", "http://www.idpf.org/2007/opf")
def rewrite_opf(from_path, to_path, calibre_id, title):
    """Copy an OPF file to a new location, patching its id and title.

    Every ``dc:identifier`` element whose ``opf:scheme`` attribute is
    ``calibre`` gets *calibre_id* as its text (the previous value is printed).
    Every ``dc:title`` element whose text is exactly ``"Untitled"`` is replaced
    by the part of *title* after its first ``/``, stripped of surrounding
    whitespace.  Other titles are left alone.

    Args:
        from_path: OPF file to read.
        to_path: Destination the patched document is written to.
        calibre_id: Value to store in the calibre identifier element(s).
        title: Replacement title, optionally prefixed with ``"<author>/"``.
    """
    dc = "{http://purl.org/dc/elements/1.1/}"
    opf = "{http://www.idpf.org/2007/opf}"

    document = ET.parse(from_path)

    for identifier in document.findall(f".//{dc}identifier[@{opf}scheme='calibre']"):
        print("found calibre_id", identifier.text)
        identifier.text = f"{calibre_id}"

    for title_tag in document.findall(f".//{dc}title"):
        if title_tag.text != "Untitled":
            continue
        # Keep only what follows the first "/" (drops an "<author>/" prefix).
        title = title.split("/", 1)[-1].strip()
        title_tag.text = title

    document.write(to_path, encoding="utf-8", xml_declaration=True, method="xml")
# Command-line interface: two positional arguments, first the library that is
# being repaired, then the temporary library that supplies the OPF files.
ap = ArgumentParser(
    prog="add_missing_opf",
    description="add missing OPF files from one Calibre library to another, preserving calibre_id's",
)
ap.add_argument("library", help="the library to fix (keep)")
ap.add_argument("tmp", help="the tmp library to use as source of OPF files")
args = ap.parse_args()
def hash_file(path, chunk_size=1 << 20):
    """Return the SHA-1 hex digest of the contents of the file at *path*.

    The file is opened in binary mode and fed to the hash in pieces of at most
    *chunk_size* bytes, so memory use stays bounded however large the file is.

    Args:
        path: File to hash (anything ``open`` accepts, e.g. ``str`` or ``Path``).
        chunk_size: Maximum number of bytes read per iteration; must be positive.

    Returns:
        The hexadecimal SHA-1 digest as a string.

    Raises:
        ValueError: If *chunk_size* is not positive.
        OSError: If the file cannot be opened or read.
    """
    if chunk_size <= 0:
        # read(0) returns b"" immediately and read(-1) slurps the whole file;
        # neither gives the bounded chunked read this function promises.
        raise ValueError("chunk_size must be a positive number of bytes")

    digest = sha1()
    with open(path, "rb") as fin:
        # Two-argument iter() keeps calling read() until it returns b"" (EOF).
        for chunk in iter(lambda: fin.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()
def metadata_files(library_path):
    """Walk a two-level ``<author>/<title> (<id>)`` library tree.

    Only sub-directories of the author folders whose name ends in a
    parenthesised number are considered book folders; everything else is
    skipped.

    Args:
        library_path: Root directory of the library.

    Yields:
        ``(title_id, title_hash, calibre_id, opf)`` tuples where

        * ``title_id`` is ``"<author folder name>/<folder name before the '('>"``,
        * ``title_hash`` is the ``hash_file`` digest of the first (sorted by
          path) ``*.pdf`` / ``*.epub`` file in the folder, or ``None`` when the
          folder holds neither (a warning is printed in that case),
        * ``calibre_id`` is the number from the folder name as an ``int``,
        * ``opf`` is the path ``<book folder>/metadata.opf``, which may or may
          not exist on disk.
    """
    folder_pattern = re.compile(r"^(.*)\((\d+)\)$")

    for author_dir in Path(library_path).glob("*"):
        if not author_dir.is_dir():
            continue
        for book_dir in author_dir.glob("*"):
            match = folder_pattern.search(book_dir.name)
            if match is None or not book_dir.is_dir():
                continue

            folder_title, id_text = match.groups()
            media = sorted([*book_dir.glob("*.pdf"), *book_dir.glob("*.epub")])
            if media:
                content_hash = hash_file(media[0])
            else:
                content_hash = None
                print(f"warning: no book media {book_dir}")

            yield (
                f"{author_dir.name}/{folder_title}",
                content_hash,
                int(id_text),
                book_dir / "metadata.opf",
            )
# PASS 1: index the OPF files present in the tmp library, keyed by the hash of
# each book's media file so that books can be matched by content in pass 2.
print(f"PASS 1: {args.tmp}")
metadata_by_title = {}
for title, title_hash, calibre_id, metadata in metadata_files(args.tmp):
    # `metadata` is a Path object and therefore always truthy; what matters is
    # whether the OPF file is really on disk, otherwise pass 2 would later try
    # to parse a file that does not exist.
    if not metadata.exists():
        continue
    # Books without media have no content hash; indexing them under None would
    # make unrelated media-less books look like the same title.
    if title_hash is None:
        continue
    if title_hash in metadata_by_title:
        print(f"warning: duplicate title: {title_hash}", file=sys.stderr)
    metadata_by_title[title_hash] = metadata
# PASS 2: for every book in the library being fixed that lacks metadata.opf,
# look up an OPF by media hash and write a patched copy next to the book as
# metadata_new.opf.  Books that already have metadata are only counted.
print()
print(f"PASS 2: {args.library}")
replaced_count = missing_count = existing_count = 0
for title, title_hash, calibre_id, metadata in metadata_files(args.library):
    if metadata.exists():
        existing_count += 1
        continue

    if title_hash not in metadata_by_title:
        print(f"NO METADATA for {title}")
        missing_count += 1
        continue

    old_metadata = metadata_by_title[title_hash]
    replaced_count += 1
    print(old_metadata)
    rewrite_opf(old_metadata, metadata.with_name("metadata_new.opf"), calibre_id, title)

total = existing_count + replaced_count + missing_count
print(f"Of {total} items, {existing_count} had metadata, {replaced_count} can be replaced, {missing_count} missing.")