Muckrock platform #116 ca_fort_bragg_pd #139 ca_livermore_pd #138 plus some credentialling #130 corrections #140 fresno #122 (#120)

Co-authored-by: Mike Stucka <[email protected]>
Co-authored-by: Gerald Rich <[email protected]>
3 people authored Oct 22, 2024
1 parent 6e7bada commit 1170a90
Showing 13 changed files with 1,074 additions and 446 deletions.
5 changes: 4 additions & 1 deletion .gitignore
@@ -131,7 +131,7 @@ Session.vim
 Sessionx.vim
 
 # Temporary
-tmp/
+tmp
 .netrwhist
 *~
 # Auto-generated tag files
@@ -140,3 +140,6 @@ tags
 [._]*.un~
 
 .vscode
+
+muckrock-api.txt
+credentials.json
1 change: 1 addition & 0 deletions Pipfile
@@ -41,6 +41,7 @@ typing-extensions = "*"
 us = "*"
 pytube = "*"
 clean-scraper = {file = ".", editable = true}
+python-dotenv = "*"
 
 [pipenv]
 allow_prereleases = false
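The new python-dotenv dependency, together with the credential files added to .gitignore above, points at environment-based credential handling. Below is a minimal sketch of how a helper like utils.get_credentials("MUCKROCK_CRP") (called by the scrapers in this commit) might resolve a key from a local .env file; the .env layout and the helper's internals are assumptions, since utils.py is not part of the rendered diff.

import os

from dotenv import load_dotenv  # provided by the new python-dotenv dependency


def get_credentials(name: str) -> str:
    # Sketch only: the real utils.get_credentials is not shown in this diff.
    # Load variables from a local .env file (which should stay gitignored),
    # then fall back to whatever is already in the process environment.
    load_dotenv()
    value = os.environ.get(name)
    if value is None:
        raise KeyError(f"Credential {name!r} not found in environment or .env")
    return value


# Usage, matching the scrapers below:
# api_key = get_credentials("MUCKROCK_CRP")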
991 changes: 547 additions & 444 deletions Pipfile.lock

Large diffs are not rendered by default.

80 changes: 80 additions & 0 deletions clean/ca/california_department_of_corrections_and_rehabilitation.py
@@ -0,0 +1,80 @@
import logging
from pathlib import Path
from typing import Dict, List

from .. import utils
from ..cache import Cache
from ..platforms.muckrock import process_muckrock

# from ..utils import MetadataDict

logger = logging.getLogger(__name__)

"""
DANGER DANGER DANGER!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
There may be multiple GovQA repositories associated with this request, but Stucka can't find 'em.
As of Oct. 20, 2024, this is only grabbing the Muckrock side of things.
"""


class Site:
"""Scrape file metadata for the California Department of Corrections and Rehabilitation.
Attributes:
name (str): The official name of the agency
"""

name = "California Department of Corrections and Rehabilitation"

def __init__(
self,
data_dir: Path = utils.CLEAN_DATA_DIR,
cache_dir: Path = utils.CLEAN_CACHE_DIR,
):
"""Initialize a new instance.
Args:
data_dir (Path): The directory where downstream processed files/data will be saved
cache_dir (Path): The directory where files will be cached
"""
self.site_slug = "ca_california_department_of_corrections_and_rehabilitation"
self.base_url = "https://www.muckrock.com/foi/california-52/sb1421-records-2022-122682" # Embargoed
# Initial disclosure page (aka where they start complying with law) contains list of "detail"/child pages with links to the SB16/SB1421/AB748 videos and files
# along with additional index pages
self.data_dir = data_dir
self.cache_dir = cache_dir
self.subpages_dir = cache_dir / (self.site_slug + "/subpages")
self.cache = Cache(cache_dir)
for localdir in [self.cache_dir, self.data_dir, self.subpages_dir]:
utils.create_directory(localdir)

def scrape_meta(self, throttle: int = 2) -> Path:
"""Gather metadata on downloadable files (videos, etc.).
Args:
throttle (int): Number of seconds to wait between requests. Defaults to 0.
Returns:
Path: Local path of JSON file containing metadata on downloadable files
"""
to_be_scraped: Dict = {
self.base_url: True,
}

metadata: List = []

subpages_dir = self.subpages_dir

api_key = utils.get_credentials("MUCKROCK_CRP")

        for start_url, force in to_be_scraped.items():
local_metadata = process_muckrock(subpages_dir, start_url, api_key, force)
metadata.extend(local_metadata)

json_filename = self.data_dir / (self.site_slug + ".json")
self.cache.write_json(json_filename, metadata)

return json_filename
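All four new scrapers in this commit share the same shape, so a single hypothetical usage sketch covers them; the module path is taken from the clean/ca/fort_bragg_pd.py file added below, and the other three import the same way.

from clean.ca.fort_bragg_pd import Site  # any of the four new Site classes

site = Site()  # defaults to utils.CLEAN_DATA_DIR and utils.CLEAN_CACHE_DIR
json_path = site.scrape_meta()  # writes <data_dir>/<site_slug>.json and returns its path
print(json_path)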
72 changes: 72 additions & 0 deletions clean/ca/fort_bragg_pd.py
@@ -0,0 +1,72 @@
import logging
from pathlib import Path
from typing import Dict, List

from .. import utils
from ..cache import Cache
from ..platforms.muckrock import process_muckrock

# from ..utils import MetadataDict

logger = logging.getLogger(__name__)


class Site:
"""Scrape file metadata for the Fort Bragg Police Department.
Attributes:
name (str): The official name of the agency
"""

name = "Fort Bragg Police Department"

def __init__(
self,
data_dir: Path = utils.CLEAN_DATA_DIR,
cache_dir: Path = utils.CLEAN_CACHE_DIR,
):
"""Initialize a new instance.
Args:
data_dir (Path): The directory where downstream processed files/data will be saved
cache_dir (Path): The directory where files will be cached
"""
self.site_slug = "ca_fort_bragg_pd"
self.base_url = "https://www.muckrock.com/foi/fort-bragg-3216/sb1421-records-2022-122822" # Embargoed
# Initial disclosure page (aka where they start complying with law) contains list of "detail"/child pages with links to the SB16/SB1421/AB748 videos and files
# along with additional index pages
self.data_dir = data_dir
self.cache_dir = cache_dir
self.subpages_dir = cache_dir / (self.site_slug + "/subpages")
self.cache = Cache(cache_dir)
for localdir in [self.cache_dir, self.data_dir, self.subpages_dir]:
utils.create_directory(localdir)

def scrape_meta(self, throttle: int = 2) -> Path:
"""Gather metadata on downloadable files (videos, etc.).
Args:
throttle (int): Number of seconds to wait between requests. Defaults to 0.
Returns:
Path: Local path of JSON file containing metadata on downloadable files
"""
to_be_scraped: Dict = {
self.base_url: True,
}

metadata: List = []

subpages_dir = self.subpages_dir

api_key = utils.get_credentials("MUCKROCK_CRP")

        for start_url, force in to_be_scraped.items():
local_metadata = process_muckrock(subpages_dir, start_url, api_key, force)
metadata.extend(local_metadata)

json_filename = self.data_dir / (self.site_slug + ".json")
self.cache.write_json(json_filename, metadata)

return json_filename
76 changes: 76 additions & 0 deletions clean/ca/fresno_pd.py
@@ -0,0 +1,76 @@
import logging
from pathlib import Path
from typing import Dict, List

from .. import utils
from ..cache import Cache
from ..platforms.muckrock import process_muckrock

# from ..utils import MetadataDict

logger = logging.getLogger(__name__)


class Site:
"""Scrape file metadata for the Fresno Police Department.
Attributes:
name (str): The official name of the agency
"""

name = "Fresno Police Department"

def __init__(
self,
data_dir: Path = utils.CLEAN_DATA_DIR,
cache_dir: Path = utils.CLEAN_CACHE_DIR,
):
"""Initialize a new instance.
Args:
data_dir (Path): The directory where downstream processed files/data will be saved
cache_dir (Path): The directory where files will be cached
"""
self.site_slug = "ca_fresno_pd"
self.base_url = (
"https://www.muckrock.com/foi/fresno-3222/sb1421-records-85695" # Embargoed
)
# Initial disclosure page (aka where they start complying with law) contains list of "detail"/child pages with links to the SB16/SB1421/AB748 videos and files
# along with additional index pages
self.data_dir = data_dir
self.cache_dir = cache_dir
self.subpages_dir = cache_dir / (self.site_slug + "/subpages")
self.cache = Cache(cache_dir)
for localdir in [self.cache_dir, self.data_dir, self.subpages_dir]:
utils.create_directory(localdir)

def scrape_meta(self, throttle: int = 2) -> Path:
"""Gather metadata on downloadable files (videos, etc.).
Args:
throttle (int): Number of seconds to wait between requests. Defaults to 0.
Returns:
Path: Local path of JSON file containing metadata on downloadable files
"""
to_be_scraped: Dict = {
"https://www.muckrock.com/foi/fresno-3222/sb1421-records-85695": True,
"https://www.muckrock.com/foi/fresno-3222/sb1421-records-2022-122831/": True,
"www.muckrock.com/foi/fresno-3222/2023-sb1421sb16-request-fresno-police-department-138622": True,
}

metadata: List = []

subpages_dir = self.subpages_dir

api_key = utils.get_credentials("MUCKROCK_CRP")

        for start_url, force in to_be_scraped.items():
local_metadata = process_muckrock(subpages_dir, start_url, api_key, force)
metadata.extend(local_metadata)

json_filename = self.data_dir / (self.site_slug + ".json")
self.cache.write_json(json_filename, metadata)

return json_filename
74 changes: 74 additions & 0 deletions clean/ca/livermore_pd.py
@@ -0,0 +1,74 @@
import logging
from pathlib import Path
from typing import Dict, List

from .. import utils
from ..cache import Cache
from ..platforms.muckrock import process_muckrock

# from ..utils import MetadataDict

logger = logging.getLogger(__name__)


class Site:
"""Scrape file metadata for the Livermore Police Department.
Attributes:
name (str): The official name of the agency
"""

name = "Livermore Police Department"

def __init__(
self,
data_dir: Path = utils.CLEAN_DATA_DIR,
cache_dir: Path = utils.CLEAN_CACHE_DIR,
):
"""Initialize a new instance.
Args:
data_dir (Path): The directory where downstream processed files/data will be saved
cache_dir (Path): The directory where files will be cached
"""
self.site_slug = "ca_livermore_pd"
self.base_url = "https://www.muckrock.com/foi/livermore-3295/2023-sb1421sb16-request-livermore-police-department-140174" # Embargoed
# Initial disclosure page (aka where they start complying with law) contains list of "detail"/child pages with links to the SB16/SB1421/AB748 videos and files
# along with additional index pages
self.data_dir = data_dir
self.cache_dir = cache_dir
self.subpages_dir = cache_dir / (self.site_slug + "/subpages")
self.cache = Cache(cache_dir)
for localdir in [self.cache_dir, self.data_dir, self.subpages_dir]:
utils.create_directory(localdir)

def scrape_meta(self, throttle: int = 2) -> Path:
"""Gather metadata on downloadable files (videos, etc.).
Args:
throttle (int): Number of seconds to wait between requests. Defaults to 0.
Returns:
Path: Local path of JSON file containing metadata on downloadable files
"""
to_be_scraped: Dict = {
"https://www.muckrock.com/foi/livermore-3295/2023-sb1421sb16-request-livermore-police-department-140174": True,
"https://www.muckrock.com/foi/livermore-3295/sb1421-records-2022-122912": True,
"https://www.muckrock.com/foi/livermore-3295/sb1421-records-85738/#files": True,
}

metadata: List = []

subpages_dir = self.subpages_dir

api_key = utils.get_credentials("MUCKROCK_CRP")

        for start_url, force in to_be_scraped.items():
local_metadata = process_muckrock(subpages_dir, start_url, api_key, force)
metadata.extend(local_metadata)

json_filename = self.data_dir / (self.site_slug + ".json")
self.cache.write_json(json_filename, metadata)

return json_filename
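The platform helper these scrapers share, process_muckrock, is not part of the rendered diff. From the call sites above, its contract appears to be roughly the following; this stub is a sketch of that assumed interface, not the real implementation in clean/platforms/muckrock.py.

from pathlib import Path
from typing import Dict, List


def process_muckrock(
    subpages_dir: Path, start_url: str, api_key: str, force: bool
) -> List[Dict]:
    # Assumed contract, inferred from the call sites above:
    # - resolve start_url to a MuckRock FOIA request via the API, using api_key
    # - cache the request's subpages under subpages_dir, refetching when force is True
    # - return one metadata dict per downloadable file found on the request
    raise NotImplementedError("See clean/platforms/muckrock.py for the real helper")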