Muckrock platform #116 ca_fort_bragg_pd #139 ca_livermore_pd #138 plus some credentialling #130 corrections #140 fresno #122 (#120)

Co-authored-by: Mike Stucka <[email protected]>
Co-authored-by: Gerald Rich <[email protected]>
3 people authored Oct 22, 2024
1 parent 6e7bada commit 1170a90
Showing 13 changed files with 1,074 additions and 446 deletions.
5 changes: 4 additions & 1 deletion .gitignore
@@ -131,7 +131,7 @@ Session.vim
 Sessionx.vim
 
 # Temporary
-tmp/
+tmp
 .netrwhist
 *~
 # Auto-generated tag files
@@ -140,3 +140,6 @@ tags
 [._]*.un~
 
 .vscode
+
+muckrock-api.txt
+credentials.json
1 change: 1 addition & 0 deletions Pipfile
@@ -41,6 +41,7 @@ typing-extensions = "*"
 us = "*"
 pytube = "*"
 clean-scraper = {file = ".", editable = true}
+python-dotenv = "*"
 
 [pipenv]
 allow_prereleases = false
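The new python-dotenv dependency, together with the credential files added to .gitignore above, points at environment-based credential handling. Below is a minimal sketch of how a helper like utils.get_credentials("MUCKROCK_CRP") (called by the scrapers in this commit) might resolve a key from a local .env file; the .env layout and the helper's internals are assumptions, since utils.py is not part of the rendered diff.

import os

from dotenv import load_dotenv  # provided by the new python-dotenv dependency


def get_credentials(name: str) -> str:
    # Sketch only: the real utils.get_credentials is not shown in this diff.
    # Load variables from a local .env file (which should stay gitignored),
    # then fall back to whatever is already in the process environment.
    load_dotenv()
    value = os.environ.get(name)
    if value is None:
        raise KeyError(f"Credential {name!r} not found in environment or .env")
    return value


# Usage, matching the scrapers below:
# api_key = get_credentials("MUCKROCK_CRP")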
991 changes: 547 additions & 444 deletions Pipfile.lock

Large diffs are not rendered by default.

80 changes: 80 additions & 0 deletions clean/ca/california_department_of_corrections_and_rehabilitation.py
@@ -0,0 +1,80 @@
import logging
from pathlib import Path
from typing import Dict, List

from .. import utils
from ..cache import Cache
from ..platforms.muckrock import process_muckrock

# from ..utils import MetadataDict

logger = logging.getLogger(__name__)

"""
DANGER DANGER DANGER!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
There may be multiple GovQA repositories associated with this request, but Stucka can't find 'em.
As of Oct. 20, 2024, this is only grabbing the Muckrock side of things.
"""


class Site:
"""Scrape file metadata for the California Department of Corrections and Rehabilitation.
Attributes:
name (str): The official name of the agency
"""

name = "California Department of Corrections and Rehabilitation"

def __init__(
self,
data_dir: Path = utils.CLEAN_DATA_DIR,
cache_dir: Path = utils.CLEAN_CACHE_DIR,
):
"""Initialize a new instance.
Args:
data_dir (Path): The directory where downstream processed files/data will be saved
cache_dir (Path): The directory where files will be cached
"""
self.site_slug = "ca_california_department_of_corrections_and_rehabilitation"
self.base_url = "https://www.muckrock.com/foi/california-52/sb1421-records-2022-122682" # Embargoed
# Initial disclosure page (aka where they start complying with law) contains list of "detail"/child pages with links to the SB16/SB1421/AB748 videos and files
# along with additional index pages
self.data_dir = data_dir
self.cache_dir = cache_dir
self.subpages_dir = cache_dir / (self.site_slug + "/subpages")
self.cache = Cache(cache_dir)
for localdir in [self.cache_dir, self.data_dir, self.subpages_dir]:
utils.create_directory(localdir)

def scrape_meta(self, throttle: int = 2) -> Path:
"""Gather metadata on downloadable files (videos, etc.).
Args:
throttle (int): Number of seconds to wait between requests. Defaults to 0.
Returns:
Path: Local path of JSON file containing metadata on downloadable files
"""
to_be_scraped: Dict = {
self.base_url: True,
}

metadata: List = []

subpages_dir = self.subpages_dir

api_key = utils.get_credentials("MUCKROCK_CRP")

        for start_url, force in to_be_scraped.items():
local_metadata = process_muckrock(subpages_dir, start_url, api_key, force)
metadata.extend(local_metadata)

json_filename = self.data_dir / (self.site_slug + ".json")
self.cache.write_json(json_filename, metadata)

return json_filename
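All four new scrapers in this commit share the same shape, so a single hypothetical usage sketch covers them; the module path is taken from the clean/ca/fort_bragg_pd.py file added below, and the other three import the same way.

from clean.ca.fort_bragg_pd import Site  # any of the four new Site classes

site = Site()  # defaults to utils.CLEAN_DATA_DIR and utils.CLEAN_CACHE_DIR
json_path = site.scrape_meta()  # writes <data_dir>/<site_slug>.json and returns its path
print(json_path)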
72 changes: 72 additions & 0 deletions clean/ca/fort_bragg_pd.py
@@ -0,0 +1,72 @@
import logging
from pathlib import Path
from typing import Dict, List

from .. import utils
from ..cache import Cache
from ..platforms.muckrock import process_muckrock

# from ..utils import MetadataDict

logger = logging.getLogger(__name__)


class Site:
"""Scrape file metadata for the Fort Bragg Police Department.
Attributes:
name (str): The official name of the agency
"""

name = "Fort Bragg Police Department"

def __init__(
self,
data_dir: Path = utils.CLEAN_DATA_DIR,
cache_dir: Path = utils.CLEAN_CACHE_DIR,
):
"""Initialize a new instance.
Args:
data_dir (Path): The directory where downstream processed files/data will be saved
cache_dir (Path): The directory where files will be cached
"""
self.site_slug = "ca_fort_bragg_pd"
self.base_url = "https://www.muckrock.com/foi/fort-bragg-3216/sb1421-records-2022-122822" # Embargoed
# Initial disclosure page (aka where they start complying with law) contains list of "detail"/child pages with links to the SB16/SB1421/AB748 videos and files
# along with additional index pages
self.data_dir = data_dir
self.cache_dir = cache_dir
self.subpages_dir = cache_dir / (self.site_slug + "/subpages")
self.cache = Cache(cache_dir)
for localdir in [self.cache_dir, self.data_dir, self.subpages_dir]:
utils.create_directory(localdir)

def scrape_meta(self, throttle: int = 2) -> Path:
"""Gather metadata on downloadable files (videos, etc.).
Args:
throttle (int): Number of seconds to wait between requests. Defaults to 0.
Returns:
Path: Local path of JSON file containing metadata on downloadable files
"""
to_be_scraped: Dict = {
self.base_url: True,
}

metadata: List = []

subpages_dir = self.subpages_dir

api_key = utils.get_credentials("MUCKROCK_CRP")

        for start_url, force in to_be_scraped.items():
local_metadata = process_muckrock(subpages_dir, start_url, api_key, force)
metadata.extend(local_metadata)

json_filename = self.data_dir / (self.site_slug + ".json")
self.cache.write_json(json_filename, metadata)

return json_filename
76 changes: 76 additions & 0 deletions clean/ca/fresno_pd.py
@@ -0,0 +1,76 @@
import logging
from pathlib import Path
from typing import Dict, List

from .. import utils
from ..cache import Cache
from ..platforms.muckrock import process_muckrock

# from ..utils import MetadataDict

logger = logging.getLogger(__name__)


class Site:
"""Scrape file metadata for the Fresno Police Department.
Attributes:
name (str): The official name of the agency
"""

name = "Fresno Police Department"

def __init__(
self,
data_dir: Path = utils.CLEAN_DATA_DIR,
cache_dir: Path = utils.CLEAN_CACHE_DIR,
):
"""Initialize a new instance.
Args:
data_dir (Path): The directory where downstream processed files/data will be saved
cache_dir (Path): The directory where files will be cached
"""
self.site_slug = "ca_fresno_pd"
self.base_url = (
"https://www.muckrock.com/foi/fresno-3222/sb1421-records-85695" # Embargoed
)
# Initial disclosure page (aka where they start complying with law) contains list of "detail"/child pages with links to the SB16/SB1421/AB748 videos and files
# along with additional index pages
self.data_dir = data_dir
self.cache_dir = cache_dir
self.subpages_dir = cache_dir / (self.site_slug + "/subpages")
self.cache = Cache(cache_dir)
for localdir in [self.cache_dir, self.data_dir, self.subpages_dir]:
utils.create_directory(localdir)

def scrape_meta(self, throttle: int = 2) -> Path:
"""Gather metadata on downloadable files (videos, etc.).
Args:
throttle (int): Number of seconds to wait between requests. Defaults to 0.
Returns:
Path: Local path of JSON file containing metadata on downloadable files
"""
to_be_scraped: Dict = {
"https://www.muckrock.com/foi/fresno-3222/sb1421-records-85695": True,
"https://www.muckrock.com/foi/fresno-3222/sb1421-records-2022-122831/": True,
"www.muckrock.com/foi/fresno-3222/2023-sb1421sb16-request-fresno-police-department-138622": True,
}

metadata: List = []

subpages_dir = self.subpages_dir

api_key = utils.get_credentials("MUCKROCK_CRP")

        for start_url, force in to_be_scraped.items():
local_metadata = process_muckrock(subpages_dir, start_url, api_key, force)
metadata.extend(local_metadata)

json_filename = self.data_dir / (self.site_slug + ".json")
self.cache.write_json(json_filename, metadata)

return json_filename
74 changes: 74 additions & 0 deletions clean/ca/livermore_pd.py
@@ -0,0 +1,74 @@
import logging
from pathlib import Path
from typing import Dict, List

from .. import utils
from ..cache import Cache
from ..platforms.muckrock import process_muckrock

# from ..utils import MetadataDict

logger = logging.getLogger(__name__)


class Site:
"""Scrape file metadata for the Livermore Police Department.
Attributes:
name (str): The official name of the agency
"""

name = "Livermore Police Department"

def __init__(
self,
data_dir: Path = utils.CLEAN_DATA_DIR,
cache_dir: Path = utils.CLEAN_CACHE_DIR,
):
"""Initialize a new instance.
Args:
data_dir (Path): The directory where downstream processed files/data will be saved
cache_dir (Path): The directory where files will be cached
"""
self.site_slug = "ca_livermore_pd"
self.base_url = "https://www.muckrock.com/foi/livermore-3295/2023-sb1421sb16-request-livermore-police-department-140174" # Embargoed
# Initial disclosure page (aka where they start complying with law) contains list of "detail"/child pages with links to the SB16/SB1421/AB748 videos and files
# along with additional index pages
self.data_dir = data_dir
self.cache_dir = cache_dir
self.subpages_dir = cache_dir / (self.site_slug + "/subpages")
self.cache = Cache(cache_dir)
for localdir in [self.cache_dir, self.data_dir, self.subpages_dir]:
utils.create_directory(localdir)

def scrape_meta(self, throttle: int = 2) -> Path:
"""Gather metadata on downloadable files (videos, etc.).
Args:
throttle (int): Number of seconds to wait between requests. Defaults to 0.
Returns:
Path: Local path of JSON file containing metadata on downloadable files
"""
to_be_scraped: Dict = {
"https://www.muckrock.com/foi/livermore-3295/2023-sb1421sb16-request-livermore-police-department-140174": True,
"https://www.muckrock.com/foi/livermore-3295/sb1421-records-2022-122912": True,
"https://www.muckrock.com/foi/livermore-3295/sb1421-records-85738/#files": True,
}

metadata: List = []

subpages_dir = self.subpages_dir

api_key = utils.get_credentials("MUCKROCK_CRP")

        for start_url, force in to_be_scraped.items():
local_metadata = process_muckrock(subpages_dir, start_url, api_key, force)
metadata.extend(local_metadata)

json_filename = self.data_dir / (self.site_slug + ".json")
self.cache.write_json(json_filename, metadata)

return json_filename
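The platform helper these scrapers share, process_muckrock, is not part of the rendered diff. From the call sites above, its contract appears to be roughly the following; this stub is a sketch of that assumed interface, not the real implementation in clean/platforms/muckrock.py.

from pathlib import Path
from typing import Dict, List


def process_muckrock(
    subpages_dir: Path, start_url: str, api_key: str, force: bool
) -> List[Dict]:
    # Assumed contract, inferred from the call sites above:
    # - resolve start_url to a MuckRock FOIA request via the API, using api_key
    # - cache the request's subpages under subpages_dir, refetching when force is True
    # - return one metadata dict per downloadable file found on the request
    raise NotImplementedError("See clean/platforms/muckrock.py for the real helper")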