You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
98 lines
3.2 KiB
Python
98 lines
3.2 KiB
Python
1 year ago
|
from argparse import ArgumentParser
|
||
|
from pathlib import Path
|
||
|
import re
|
||
|
import sys
|
||
|
from hashlib import sha1
|
||
|
from xml.etree import ElementTree as ET
|
||
|
|
||
|
|
||
|
# Register the namespace prefixes ElementTree should use when serializing:
# the OPF namespace as the default (empty prefix) and Dublin Core as "dc".
ET.register_namespace(prefix="", uri="http://www.idpf.org/2007/opf")
ET.register_namespace(prefix="dc", uri="http://purl.org/dc/elements/1.1/")
|
||
|
|
||
|
def rewrite_opf(from_path, to_path, calibre_id, title):
    """Write a copy of an OPF file with its calibre id and placeholder title replaced.

    Args:
        from_path: OPF file to read.
        to_path: Destination the modified document is written to.
        calibre_id: Value stored (as text) in every ``dc:identifier`` element
            whose ``opf:scheme`` attribute is ``calibre``.
        title: Replacement for any ``dc:title`` whose text is exactly
            ``"Untitled"``. Everything up to and including the first ``/``
            is dropped and surrounding whitespace is stripped before use.
    """
    calibre_identifier_query = ".//{http://purl.org/dc/elements/1.1/}identifier[@{http://www.idpf.org/2007/opf}scheme='calibre']"
    title_query = ".//{http://purl.org/dc/elements/1.1/}title"

    document = ET.parse(from_path)

    for identifier in document.findall(calibre_identifier_query):
        print("found calibre_id", identifier.text)
        identifier.text = f"{calibre_id}"

    for title_element in document.findall(title_query):
        if tag_is_placeholder := (title_element.text == "Untitled"):
            # Only placeholder titles are overwritten; real titles are kept.
            title = title.split("/", 1)[-1].strip()
            title_element.text = title

    document.write(to_path, xml_declaration=True, encoding='utf-8', method='xml')
|
||
|
|
||
|
# Command-line interface: two positional arguments, parsed at import time.
ap = ArgumentParser(
    prog="add_missing_opf",
    description="add missing OPF files from one Calibre library to another, preserving calibre_id's",
)
ap.add_argument("library", help="the library to fix (keep)")
ap.add_argument("tmp", help="the tmp library to use as source of OPF files")
args = ap.parse_args()
|
||
|
|
||
|
def hash_file(path):
    """Return the SHA-1 hex digest of the contents of the file at ``path``.

    The file is read in fixed-size chunks so that large files are never
    held in memory in full.

    Args:
        path: Path of the file to hash; opened in binary mode.

    Returns:
        The hexadecimal SHA-1 digest as a string.
    """
    chunk_size = 1024 * 1024
    digest = sha1()
    with open(path, "rb") as fin:
        # read() returns b"" at end of file, which ends the iteration.
        for chunk in iter(lambda: fin.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()
|
||
|
|
||
|
def metadata_files(library_path):
    """Yield one record per book folder found two levels below ``library_path``.

    Only directories of the form ``<author>/<title>(<digits>)`` are
    considered; anything else is skipped silently.

    Args:
        library_path: Root directory of the library to scan.

    Yields:
        Tuples ``(title_id, title_hash, calibre_id, opf)`` where

        * ``title_id`` is ``"<author folder name>/<title>"``, the title being
          the folder name with the trailing ``(<digits>)`` removed (any
          whitespace before the parenthesis is kept),
        * ``title_hash`` is ``hash_file`` applied to the first ``*.pdf`` /
          ``*.epub`` file in sorted order, or ``None`` (after printing a
          warning) when the folder has no such file,
        * ``calibre_id`` is the parenthesised number as an ``int``,
        * ``opf`` is the path ``<book folder>/metadata.opf``, whether or not
          that file exists.
    """
    folder_name_pattern = re.compile(r"^(.*)\((\d+)\)$")

    for author_folder in Path(library_path).glob("*"):
        if not author_folder.is_dir():
            continue

        for title_folder in author_folder.glob("*"):
            parsed = folder_name_pattern.search(title_folder.name)
            if not title_folder.is_dir() or parsed is None:
                continue

            title, id_digits = parsed.group(1), parsed.group(2)
            title_id = f"{author_folder.name}/{title}"

            # Sorting makes the choice of the hashed media file deterministic.
            candidates = sorted(
                [*title_folder.glob("*.pdf"), *title_folder.glob("*.epub")]
            )
            if candidates:
                title_hash = hash_file(candidates[0])
            else:
                title_hash = None
                print(f"warning: no book media {title_folder}")

            yield title_id, title_hash, int(id_digits), title_folder / "metadata.opf"
|
||
|
|
||
|
|
||
|
# PASS 1: index the OPF files of the tmp library by the hash of each book's
# media file, so the same book can be found in the other library even when
# its folder name differs.
print(f"PASS 1: {args.tmp}")
metadata_by_title = {}
for title, title_hash, calibre_id, metadata in metadata_files(args.tmp):
    # `metadata` is a Path and therefore always truthy; only an OPF that
    # actually exists on disk can be used as a source later on.
    if not metadata.exists():
        continue
    # Books without media have no hash. Indexing them under None would make
    # every such book collide and hand out unrelated metadata in pass 2.
    if title_hash is None:
        continue
    if title_hash in metadata_by_title:
        print(f"warning: duplicate title: {title_hash}", file=sys.stderr)
    metadata_by_title[title_hash] = metadata
print()

# PASS 2: for every book of the library that lacks metadata.opf, write a
# metadata_new.opf derived from the matching OPF found in pass 1.
print(f"PASS 2: {args.library}")
replaced_count = 0
missing_count = 0
existing_count = 0

for title, title_hash, calibre_id, metadata in metadata_files(args.library):
    if metadata.exists():
        existing_count += 1
        continue

    # None is never a key of the index, so books without media end up here
    # with no match and are reported as missing.
    old_metadata = metadata_by_title.get(title_hash)
    if old_metadata is None:
        print(f"NO METADATA for {title}")
        missing_count += 1
        continue

    replaced_count += 1
    print(old_metadata)
    rewrite_opf(old_metadata, metadata.parent / "metadata_new.opf", calibre_id, title)

total = existing_count + replaced_count + missing_count
print(f"Of {total} items, {existing_count} had metadata, {replaced_count} can be replaced, {missing_count} missing.")
|
||
|
|
||
|
|
||
|
|
||
|
|