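Add setup.sh, update README, and tidy extractpdfs.py (import cleanup, prefix match for Course Roadmap/Outline headers, dict instead of OrderedDict)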

Lucas Faudman · Lucas Faudman · commit dcca5b2d76a6 · 2024-05-02T02:39:40.000-07:00
diff --git a/README.md b/README.md
@@ -1,8 +1,15 @@
 # sans-index-generator
-Generate Indexes from SANS PDFs
+**Generate Indexes from SANS PDFs**
 
 > NOTE: May not work with all SANS PDFs due to different structures. Modify the `fix_text` and `extract_pdf_text` methods in `extractpdfs.py` to match the structure of the PDFs you are working with if errors occur.
 
+## Setup
+Run the following command to clone the repository and run the setup script.
+```bash
+git clone https://github.com/LucasFaudman/sans-index-generator && cd sans-index-generator && chmod +x setup.sh && ./setup.sh
+```
+
+## Usage
 ```bash
 usage: extractpdfs.py [-h] [-P PASSWORD] [-O OUT] [--maxwidth MAXWIDTH]
                       [--only-page-order] [--only-alpha]
@@ -43,7 +50,7 @@ optional arguments:
                         Save index to file
 ```
 
-### Example Output
+## Example Output
 ```
 560/SEC560-Book1.pdf:
 
diff --git a/extractpdfs.py b/extractpdfs.py
@@ -1,10 +1,11 @@
 import argparse
 import re
 import json
+from sys import stdout, stderr
 from pathlib import Path
-from collections import defaultdict, OrderedDict
+from collections import defaultdict
 from concurrent.futures import ProcessPoolExecutor
-from sys import stdout, stderr
+
 from textwrap import TextWrapper
 from PyPDF2 import PdfReader
 
@@ -83,7 +84,7 @@ def make_index(file_pages, keep_roadmap=False, keep_toc=False, keep_continuation
     index = defaultdict(dict)
     for filename, pages in file_pages.items():
         for page_num, (header, text, references) in pages.items():
-            if not keep_roadmap and header in ["Course Roadmap", "Course Outline"]:
+            if not keep_roadmap and header.startswith(("Course Roadmap", "Course Outline")):
                 continue
             if not keep_toc and header == "TABLE OF CONTENTS":
                 continue
@@ -122,7 +123,7 @@ def print_index_by_alpha_order(index, stream=None, maxwidth=80):
 
     def sort_fn(x): return x[0].replace(
         'The ', '', 1).replace('A ', '', 1).lower()
-    alpha_index = OrderedDict(sorted(alpha_index.items(), key=sort_fn))
+    alpha_index = dict(sorted(alpha_index.items(), key=sort_fn))
     max_pagestr_len = max(len(": " + ','.join(page_nums))
                           for page_nums in alpha_index.values())