-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathgoodreads-seriescheck.py
96 lines (82 loc) · 3.82 KB
/
goodreads-seriescheck.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
#
# Read records from a MARC file and, using the author and title information, gets series information from Goodreads.
# The series information from Goodreads, if any, is compared to the series information in the record's 490, 800
# and 830 fields, if any.
#
# Usage: python goodreads-seriescheck.py -if <MARC input file> [ -of <output file> ] [ -sep <separator> ]
#
# The output is written as an Excel-compatible CSV file. The output file defaults to the console.
# The separator defaults to comma.
#
# The columns in the CSV file are: an index number, the 100a, the 245a, the 490a or None, the 800t or 830a or None,
# the series information from Goodreads, and a 'Y' indication if the series information from Goodreads does not
# match the series information in the record.
#
# If you find yourself wishing for a different format.
#
# Version: 0.2.0 2/8/25
#
# License: CC BY-NC-SA 4.0, https://creativecommons.org/licenses/by-nc-sa/4.0/
#
# Graeme Williams
#
import argparse
import contextlib
import csv
import pymarc
import requests
import sys
from lib import goodreads
def get_subfield(theRecord, tag, subfield):
    """Return one subfield value from a record, or the string "None".

    theRecord -- object whose .get(tag) returns a field or a falsy value
                 (presumably a pymarc Record; confirm against the caller)
    tag       -- field tag to look up, e.g. "245"
    subfield  -- subfield code to read from that field, e.g. "a"

    The literal string "None" (not the None object) is returned when the
    field is absent or when the subfield is absent or empty.
    """
    field = theRecord.get(tag)
    if not field:
        return "None"
    # An empty or missing subfield value is reported the same way as a missing field.
    return field.get(subfield) or "None"
def get_series(theRecord):
    """Return the series title from the record's 800 $t or 830 $a, or "None".

    theRecord -- object whose .get(tag) returns a field or a falsy value
                 (presumably a pymarc Record; confirm against the caller)

    The 800 field is preferred. If it is absent, or present without a usable
    $t, the 830 $a is tried next, so a record whose 800 lacks a title still
    reports its 830 series. The literal string "None" is returned when
    neither source yields a value.
    """
    for tag, code in (("800", "t"), ("830", "a")):
        field = theRecord.get(tag)
        if field and (title := field.get(code)):
            return title
    return "None"
def compare_series(series_490, series_800, gr_series):
    """Tell whether the Goodreads series equals either series from the record.

    series_490 -- series statement taken from the record
    series_800 -- series added entry taken from the record
    gr_series  -- series name reported by Goodreads; may be empty or None

    Returns True or False from an exact equality comparison when gr_series
    is non-empty. When gr_series is empty the string "None" is returned;
    note that this string is truthy, so a caller that only tests the result
    sees it as a match.
    """
    if gr_series:
        candidates = (series_490, series_800)
        return any(gr_series == candidate for candidate in candidates)
    return "None"
def create_file_context(file, mode="r", encoding="utf-8", newline=""):
    """Return a context manager yielding a file object for a path or a stream.

    This lets a path given on the command line and an already-open stream
    such as sys.stdout be used interchangeably in a with-statement.

    file     -- a path (str) to open, or an already-open file-like object
    mode     -- mode passed to open() when file is a path; defaults to "r",
                matching open() (an empty mode string is rejected by open())
    encoding -- text encoding passed to open() when file is a path
    newline  -- newline handling passed to open() when file is a path;
                defaults to "" because the csv module asks for files it
                reads or writes to be opened that way (otherwise rows gain
                an extra carriage return on platforms that translate "\n")

    When file is a str, the opened file is closed on leaving the with-block.
    Any other object is yielded unchanged and is NOT closed; the caller
    remains responsible for it. mode, encoding and newline are ignored then.
    """
    if isinstance(file, str):
        return open(file, mode=mode, encoding=encoding, newline=newline)
    # Already-open stream: hand it back untouched and leave closing to the caller.
    return contextlib.nullcontext(file)
def main():
    """Read MARC records and write a CSV comparing their series to Goodreads.

    Command-line arguments:
      --inputfile / -if    MARC file to read (required)
      --outputfile / -of   CSV file to write; defaults to standard output
      --separator / -sep   single-character CSV delimiter; defaults to ","

    One row is written per readable record: index, 100 $a, 245 $a, 490 $a,
    800 $t / 830 $a, the Goodreads series, and "Y" when the Goodreads series
    matches neither series value from the record. Records the MARC reader
    cannot parse are reported on standard error and skipped.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--inputfile", "-if", help="MARC file to read", required=True)
    parser.add_argument("--outputfile", "-of", help="CSV output file", required=False, default=sys.stdout)
    parser.add_argument("--separator", "-sep", help="CSV separator (a single character)", required=False, default=",")
    args = parser.parse_args()

    # csv.writer only accepts a one-character delimiter; report anything else
    # as a usage error instead of failing later with a TypeError.
    if len(args.separator) != 1:
        parser.error("--separator must be a single character")

    with requests.Session() as session:
        # Browser-like User-Agent sent with every request made through this session.
        session.headers.update({'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'})

        # Iterate over the records in the file and call Goodreads to get the series
        with (open(args.inputfile, 'rb') as marc_input,
              create_file_context(args.outputfile, mode="w", encoding="utf-8") as csv_output):
            writer = csv.writer(csv_output, delimiter=args.separator)
            writer.writerow(["Index", "100a", "245a", "490a", "800t/830a", "Goodreads series", "Mismatch?"])

            reader = pymarc.MARCReader(marc_input)
            for i, aRecord in enumerate(reader):
                # MARCReader yields None for a record it could not parse and
                # keeps the reason in current_exception; skip it but keep the
                # index in step with the record's position in the file.
                if aRecord is None:
                    print(f"Record {i} could not be read: {reader.current_exception}", file=sys.stderr)
                    continue

                # Trim the trailing punctuation that surrounds these subfields.
                author = get_subfield(aRecord, "100", "a").strip(",")
                title = get_subfield(aRecord, "245", "a").strip(" /:")
                series_490 = get_subfield(aRecord, "490", "a").strip(" ;")
                series_8XX = get_series(aRecord).strip(" ;")

                # get data from Goodreads corresponding to the record; only the
                # text before the first comma of the author (presumably the
                # surname) is used for the lookup
                worknumber = goodreads.get_worknumber(session, author.split(',')[0], title)
                if not worknumber:
                    gr_series = "No title match"
                    indicator = " "
                else:
                    gr_series = goodreads.get_seriesname(session, worknumber)
                    indicator = " " if compare_series(series_490, series_8XX, gr_series) else "Y"
                writer.writerow([i, author, title, series_490, series_8XX, gr_series, indicator])


if __name__ == "__main__":
    main()