Skip to content

Commit

Permalink
Add files via upload
Browse files Browse the repository at this point in the history
  • Loading branch information
JackStrawFromWichita authored Nov 20, 2019
0 parents commit ffa0c08
Show file tree
Hide file tree
Showing 3 changed files with 315 additions and 0 deletions.
143 changes: 143 additions & 0 deletions README.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,143 @@
#-----------------------------------------------------------------------------------------------------------------------

# AS A DEADHEAD, ARCHIVE.ORG IS AN INVALUABLE RESOURCE. WE ARE ALL FAMILIAR WITH SHOWS THAT ALLOW YOU TO DOWNLOAD FLACS OR MP3S,
# AND FOR 'STREAM ONLY' SHOWS THERE IS OUR BELOVED GRATEFUL GRABBER. BUT WHAT IF YOU WANT EVERY SHOW? YES, EVERY SHOW. OR EVERY SHOW OF A PARTICULAR YEAR.
# PARANOID BY NATURE, I BEGAN IMAGINING A DISASTER SCENARIO WHERE THE ZOMBIE (OR OTHER) APOCALYPSE TAKES DOWN THE INTERNET,
# AND WITH IT, OUR BELOVED ARCHIVE. IN THAT BRAVE NEW LANDSCAPE I STILL WANT TO DISCOVER NEW SHOWS AND LISTEN TO THEM AS I PLEASE,
# ON A GENERATOR POWERED STEREO, AS I WAIT FOR THE UNKNOWN.

# AS OF 11/20/2019 THERE ARE EXACTLY 14000 SHOWS IN THE ARCHIVE'S GRATEFUL DEAD COLLECTION.
# IF YOU HAVE A GOOD INTERNET CONNECTION AND PLENTY OF STORAGE SPACE, YOU CAN DOWNLOAD ALL 14000 SHOWS WITH 7 LINES OF CODE.
# THOUGH SOME MAY FIND IT MORE MANAGEABLE TO SEGMENT AND DOWNLOAD SHOWS BY YEAR.

# AFTER EXTENSIVE RESEARCH AND MUCH TRIAL AND ERROR I STUMBLED UPON THESE RESOURCES WHICH LED TO THE SOLUTION:

# ARCHIVE HAS ITS OWN PYTHON API FOR DOWNLOADING STUFF??? THIS IS WAY EASIER THAN THE BEAUTIFUL SOUP STUFF I WAS TRYING:
# https://gareth.halfacree.co.uk/2013/04/bulk-downloading-collections-from-archive-org

# HOW TO OBTAIN LISTS OF IDENTIFIERS USED TO DOWNLOAD SHOWS:
# https://blog.archive.org/2012/04/26/downloading-in-bulk-using-wget/

# internetarchive API QUICK START GUIDE:
# https://archive.org/services/docs/api/internetarchive/quickstart.html

# PRE-REQUISITES:

# DOWNLOAD AND INSTALL PYTHON: https://www.python.org/downloads/

# DOWNLOAD A PYTHON IDE (OPTIONAL): https://www.jetbrains.com/pycharm/

# GET internetarchive PACKAGE: (from command line) pip install internetarchive

# CREATE CONFIG FILE WITH ARCHIVE.ORG CREDENTIALS:

# (from command line) ia configure

# Enter your archive.org credentials below to configure 'ia'.
#
# Email address: [email protected]
# Password:
#
# Config saved to: /home/user/.config/ia.ini

# OBTAIN CSV LIST OF ALL SHOWS:

# GO TO ARCHIVE.ORG: UNDER THE SEARCH BAR CLICK 'Advanced Search'; IN THE UPPER 'ADVANCED SEARCH' BLOCK: SEARCH:

# AND Collection: is GratefulDead <--must type exactly this

# HIT SEARCH; THIS REDIRECTS TO SEARCH RESULTS WITH SYNTAX IN SEARCH FIELD: collection:(GratefulDead)
# COPY SEARCH SYNTAX AND GO BACK TO ADVANCED SEARCH PAGE
#
# IN LOWER 'Advanced Search returning JSON, XML, and more' BLOCK: PASTE: collection:(GratefulDead) INTO 'Query' FIELD
# SELECT 'identifier' IN 'Fields to return' list
# CHANGE 'NUMBER OF RESULTS' TO 15000
# SELECT 'CSV FORMAT'
# HIT SEARCH
# HIT 'OK' ON POP-UP NOTE
# OPEN CSV FILE, DELETE 'IDENTIFIER' COLUMN HEADER
# SAVE, CLOSE
# SET PATH TO CSV FILE (HARDCODE IN .PY FILE)
# SET PATH TO LOCAL DIRECTORY TO SAVE FILES (HARDCODE IN .PY FILE)
# RUN SELECTED .PY FILE FROM COMMAND LINE
# OPTION 1: DOWNLOAD ALL 14000 GRATEFUL DEAD SHOWS AT ONCE: download_all_gd.py
# OPTION 2: SEGMENT AND DOWNLOAD SHOWS BY YEAR: download_gd_by_year.py

# NOTE TO SELF: COLLECTION NAME FOR DEAD & COMPANY: collection:(DeadAndCompany)
# NOTE TO ALL: YOU CAN FIND THESE 'COLLECTION NAMES' ON ARCHIVE, YOU JUST GOTTA POKE AROUND

# STATS FROM SEGMENTATION AND DOWNLOAD BY YEAR BASED ON 1975:

# INTERNET CONNECTION DETAILS:
# DOWNLOAD Mbps: 887.41
# UPLOAD Mbps: 839.64
# https://www.speedtest.net/

# 1975: 55 shows
# time to download all 1975: 81.11 minutes
# download time per show: 1.47 minutes
# total size of 1975: 6.35 GB
# average file size per show: 115.45 MB

# Estimate regarding all 14000 shows based on 1975 stats:
# Approx: 1.62 TB TOTAL
# Approx: 344 Total hours to download (14 days!)
# MILEAGE WILL VARY

# BREAKDOWN: NUMBER OF SHOWS BY YEAR (as of 11/20/2019):
# 65: 3 shows
# 66: 69 shows
# 67: 48 shows
# 68: 113 shows
# 69: 291 shows
# 70: 340 shows
# 71: 307 shows
# 72: 325 shows
# 73: 379 shows
# 74: 270 shows
# 75: 55 shows
# 76: 251 shows
# 77: 352 shows
# 78: 462 shows
# 79: 453 shows
# 80: 613 shows
# 81: 638 shows
# 82: 524 shows
# 83: 812 shows
# 84: 688 shows
# 85: 874 shows
# 86: 497 shows
# 87: 772 shows
# 88: 684 shows
# 89: 854 shows
# 90: 933 shows
# 91: 682 shows
# 92: 406 shows
# 93: 576 shows
# 94: 412 shows
# 95: 315 shows
# gdnrps: 2 shows
# Total: 14000 shows

# TESTING SOME FUNCTIONALITY IN THE internetarchive API

#-----------------------------------------------------------------------------------------------------------------------

# TESTING:

# DOWNLOADABLE SHOW

#THIS IS THE IDENTIFIER
#download('gd1977-11-05.aud.zimmerman.minches.81180.sbeok.flac16', verbose=True, glob_pattern='*.mp3') # SUCCESS

#-----------------------------------------------------------------------------------------------------------------------

# TESTING:

# STREAM-ONLY SHOW

#THIS IS THE IDENTIFIER
#download('gd73-06-10.sbd.hollister.174.sbeok.shnf', verbose=True, glob_pattern='*.mp3', destdir=r"H:\gd") # SUCCESS

#-----------------------------------------------------------------------------------------------------------------------

37 changes: 37 additions & 0 deletions download_all_gd.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
"""Download every Grateful Dead show listed in a CSV of archive.org identifiers.

Reads one identifier per line from CSV_PATH and downloads each item's *.mp3
files into DEST_DIR using the `internetarchive` package
(`pip install internetarchive`; credentials configured via `ia configure`).
"""

import time as t

#-----------------------------------------------------------------------------------------------------------------------

# PATH TO CSV FILE (one archive.org identifier per line, no header row)
CSV_PATH = r"H:\14000.csv"

# LOCAL DIRECTORY TO SAVE FILES
DEST_DIR = r"C:\Users\username\Desktop\gd"

#-----------------------------------------------------------------------------------------------------------------------


def read_identifiers(path):
    """Return the non-blank, whitespace-stripped lines of *path* as a list.

    The original script read the file twice (the first read, done without a
    context manager, was immediately discarded); a single `with` block is
    sufficient and guarantees the handle is closed. Blank lines are skipped
    so they are never passed to download().
    """
    with open(path) as f:
        return [line.strip() for line in f if line.strip()]


def main():
    """Time and run the bulk download of all shows listed in CSV_PATH."""
    # Third-party import kept function-local so read_identifiers() can be
    # imported (and tested) without the internetarchive package installed.
    from internetarchive import download

    # START TIME OF JOB
    start = t.time()

    # DOWNLOAD ALL 14000 GRATEFUL DEAD SHOWS AT ONCE
    for identifier in read_identifiers(CSV_PATH):
        download(identifier, verbose=True, glob_pattern='*.mp3', destdir=DEST_DIR)

    # END TIME OF JOB
    end = t.time()
    print('time to complete: ' + str((end - start) / 60) + ' minutes')


if __name__ == "__main__":
    main()
135 changes: 135 additions & 0 deletions download_gd_by_year.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,135 @@
"""Download Grateful Dead shows from archive.org, segmented by year.

Reads archive.org identifiers (one per line) from CSV_PATH, groups them by
year using the identifier naming convention of the GratefulDead collection
('gdYY...' / 'gd19YY...'), then downloads one chosen year's shows as *.mp3
into DEST_DIR. Requires the `internetarchive` package and `ia configure`.
"""

import time as t

#-----------------------------------------------------------------------------------------------------------------------

# PATH TO CSV FILE (one archive.org identifier per line, no header row)
CSV_PATH = r"H:\14000.csv"

# LOCAL DIRECTORY TO SAVE FILES
DEST_DIR = r"C:\Users\username\Desktop\gd"

# YEAR TO DOWNLOAD; CHANGE AS DESIRED: '65' through '95', or 'gdnrps'
YEAR_TO_DOWNLOAD = '75'

#-----------------------------------------------------------------------------------------------------------------------


def read_identifiers(path):
    """Return the non-blank, whitespace-stripped lines of *path* as a list.

    The original script read the file twice (the first read, done without a
    context manager, was immediately discarded); a single `with` block is
    sufficient and guarantees the handle is closed.
    """
    with open(path) as f:
        return [line.strip() for line in f if line.strip()]


def segment_by_year(identifiers):
    """Group *identifiers* by two-digit year.

    Returns a dict mapping '65'..'95' (plus the special key 'gdnrps') to the
    list of identifiers containing that year's markers, preserving input
    order. An identifier for year YY matches if it contains 'gdYY' or
    'gd19YY' anywhere — exactly the substring test the original performed in
    31 near-identical copy/pasted blocks, now expressed once.
    """
    matchers = {str(y): ('gd%d' % y, 'gd19%d' % y) for y in range(65, 96)}
    matchers['gdnrps'] = ('gd_nrps',)
    return {
        key: [s for s in identifiers if any(pat in s for pat in patterns)]
        for key, patterns in matchers.items()
    }


def main():
    """Time and run the download of YEAR_TO_DOWNLOAD's shows."""
    # Third-party import kept function-local so the helpers above can be
    # imported (and tested) without the internetarchive package installed.
    from internetarchive import download

    # START TIME OF JOB
    start = t.time()

    # SEGMENT BY YEAR, THEN DOWNLOAD THE SELECTED YEAR
    by_year = segment_by_year(read_identifiers(CSV_PATH))
    for identifier in by_year[YEAR_TO_DOWNLOAD]:
        download(identifier, verbose=True, glob_pattern='*.mp3', destdir=DEST_DIR)

    # END TIME OF JOB
    end = t.time()
    print('time to complete: ' + str((end - start) / 60) + ' minutes')


if __name__ == "__main__":
    main()

0 comments on commit ffa0c08

Please sign in to comment.