forked from john-hix/scrape-canvas
-
Notifications
You must be signed in to change notification settings - Fork 0
/
canvas-scraper.py
123 lines (105 loc) · 4.92 KB
/
canvas-scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
#!bin/python3
import argparse
import os
import re
from pathvalidate import sanitize_filename
from canvasapi import Canvas
from canvasapi.course import Course
from canvasapi.exceptions import Unauthorized, ResourceDoesNotExist
from canvasapi.file import File
from canvasapi.module import Module, ModuleItem
def extract_files(text):
    """Return the distinct Canvas file IDs referenced in *text*.

    Every case-insensitive occurrence of ``/files/<digits>`` contributes its
    digit run. The IDs are returned as strings, collected into a set so that
    repeated links to the same file appear only once.
    """
    id_pattern = re.compile("/files/(\\d+)", re.IGNORECASE)
    return {hit.group(1) for hit in id_pattern.finditer(text)}
def _save_html_item(course, html, directory, title, files_downloaded):
    """Write an item's HTML to disk and download the files it links to.

    :param course: course object used to look up the linked files
    :param html: HTML body of the page/assignment; ``None`` is treated as empty
    :param directory: existing directory that receives the output
    :param title: item title, used (sanitized) as the ``.html`` file name
    :param files_downloaded: set of integer file IDs already fetched for this
        course; updated in place
    """
    html = html or ""
    with open(os.path.join(directory, sanitize_filename(title) + ".html"), "w", encoding="utf-8") as f:
        f.write(html)

    for raw_id in extract_files(html):
        # extract_files() captures IDs as strings; every ID is normalised
        # with int() before it touches the set so lookups always match.
        file_id = int(raw_id)
        if file_id in files_downloaded:
            continue
        try:
            file = course.get_file(file_id)
            files_downloaded.add(file_id)
            file.download(os.path.join(directory, sanitize_filename(file.attributes['filename'])))
        except (ResourceDoesNotExist, Unauthorized) as error:
            # A dead or inaccessible link must not abort the whole scrape.
            print(f"{course.attributes['name']} - skipping linked file {file_id} "
                  f"({type(error).__name__})")


def get_course_files(course):
    """Download the module contents and remaining files of one course.

    Files, pages, external URLs and assignments found in the course's modules
    are written to ``<output>/<course name>/<module name>/``. Afterwards every
    file in the course file listing that was not already fetched is written
    to ``<output>/<course name>/``.

    Reads the module-level names ``output`` (destination folder) and
    ``canvas`` (API client), which the ``__main__`` block assigns.

    :param course: the course to scrape
    """
    course_name = course.attributes['name']
    course_dir = os.path.join(output, sanitize_filename(course_name))
    # Created up front so the final sweep has a destination even when the
    # course has no module items.
    os.makedirs(course_dir, exist_ok=True)

    # Integer IDs of files already downloaded for this course.
    files_downloaded = set()

    for module in course.get_modules():
        module_name = module.attributes['name']
        module_dir = os.path.join(course_dir, sanitize_filename(module_name))

        for item in module.get_module_items():
            # Created lazily so that empty modules leave no empty folder.
            os.makedirs(module_dir, exist_ok=True)

            item_type = item.attributes["type"]
            print(f"{course_name} - "
                  f"{module_name} - "
                  f"{item.attributes['title']} ({item_type})")

            if item_type == "File":
                file = canvas.get_file(item.attributes["content_id"])
                files_downloaded.add(int(item.attributes["content_id"]))
                file.download(os.path.join(module_dir, sanitize_filename(file.attributes['filename'])))
            elif item_type == "Page":
                page = course.get_page(item.attributes["page_url"])
                _save_html_item(course, page.attributes["body"], module_dir,
                                item.attributes['title'], files_downloaded)
            elif item_type == "ExternalUrl":
                url = item.attributes["external_url"]
                shortcut = os.path.join(module_dir, sanitize_filename(item.attributes['title']) + ".url")
                with open(shortcut, "w") as f:
                    f.write("[InternetShortcut]\n")
                    f.write("URL=" + url)
            elif item_type == "Assignment":
                assignment = course.get_assignment(item.attributes["content_id"])
                _save_html_item(course, assignment.attributes["description"], module_dir,
                                item.attributes['title'], files_downloaded)

    # Sweep the course file listing for anything the modules did not cover.
    try:
        for file in course.get_files():
            if int(file.attributes["id"]) in files_downloaded:
                continue
            print(f"{course_name} - {file.attributes['filename']}")
            file.download(os.path.join(course_dir, sanitize_filename(file.attributes['filename'])))
    except Unauthorized:
        # Best effort: the listing is not available to this token, so only
        # the module contents are kept.
        print(f"{course_name} - course file listing not accessible, skipping")
if __name__ == "__main__":
    # Command-line entry point: parse arguments, connect to Canvas, then
    # scrape each selected course.
    parser = argparse.ArgumentParser(description="Download all content from Canvas")
    parser.add_argument("url", help="URL to the Canvas website, e.g. https://canvas.utwente.nl")
    parser.add_argument("token", help="Token generated in the settings page on Canvas")
    parser.add_argument("output", help="Path to the output folder, e.g. output/")
    # For an optional positional, argparse falls back to `default` (not
    # `const`) when the argument is omitted.
    parser.add_argument("courses", help="Comma-separated course IDs or 'all'", nargs="?", default=None)
    args = parser.parse_args()

    # `output` and `canvas` are module-level names read by get_course_files().
    output = args.output.rstrip("/") + "/"

    selection = args.courses
    if selection is None:
        selection = "all"
        print("No courses specified. Scraping all courses.")

    canvas = Canvas(args.url, args.token)

    # Select courses to scrape, default to all
    if selection == "all":
        courses = canvas.get_courses()
    else:
        try:
            # Blank entries (e.g. from a trailing comma) are ignored; int()
            # itself tolerates surrounding whitespace.
            course_ids = [int(part) for part in selection.split(",") if part.strip()]
        except ValueError:
            # Exits with a usage message instead of a raw traceback.
            parser.error(f"courses must be 'all' or comma-separated integer IDs, got {selection!r}")
        courses = [canvas.get_course(course_id) for course_id in course_ids]

    # Perform scrape
    for course in courses:
        get_course_files(course)