First commit

Script is functional for resources and folders.
2024-01-19 19:04:20 +00:00
commit 3dc76ea34c
4 changed files with 311 additions and 0 deletions

151
MoodleCourse.py Normal file

@@ -0,0 +1,151 @@
import requests
from bs4 import BeautifulSoup
import bs4
import os
from urllib.parse import unquote


class MoodleActivity:
    """A single activity (resource, folder, ...) inside a course section."""

    def __init__(self, soup):
        self.soup = soup
        self.name = None
        self.url = None
        self.type = None
        self.parent_dir = None
        self.all_urls = self._get_all_urls()
        if self.all_urls:
            self.init()

    def __repr__(self):
        return f"{self.type}: {self.name}"

    def init(self):
        self.name = self._get_name()
        self.type = self._get_type()
        self.url = self._get_url()

    def _get_all_urls(self):
        return list(map(lambda x: x["href"], self.soup.find_all("a")))

    def _get_name(self):
        link = self.soup.find("a")
        text = link.text
        # Strip the hidden "accesshide" suffix Moodle appends to the visible name.
        toremove = link.find("span", {"class": "accesshide"})
        if toremove is None:
            name = text.strip()
        else:
            name = text[:text.find(toremove.text)].strip()
        return unquote(name)[:min(len(name), 50)]

    def _get_url(self):
        url = self.soup.find("a")["href"]
        if url in self.all_urls:
            self.all_urls.remove(url)
        return url

    def _get_type(self):
        # The third CSS class of the activity <li> is its type (e.g. "resource", "folder").
        return self.soup["class"][2]

    def set_parent_dir(self, parent_dir: str):
        self.parent_dir = parent_dir

    def find_file(self, s: requests.Session) -> bool:
        """Resolve self.url to a direct file URL. Returns True on success."""
        r = s.get(self.url, allow_redirects=True)
        if r.url.find("view.php?id=") > 0:
            try:
                soup = BeautifulSoup(r.text, 'html.parser')
                main_region = soup.find("section", {"id": "region-main"})
                if main_region is not None:
                    link = main_region.find("a")
                    if link is not None:
                        self.url = link["href"]
                        return True
                    else:
                        img = main_region.find("img")
                        if img is not None:
                            self.url = img["src"]
                            return True
                        else:
                            return False
                else:
                    # Some resources are embedded in a frameset; the second frame holds the file.
                    frames = soup.find_all("frame")
                    if len(frames) > 1:
                        self.url = frames[1]["src"]
                        return True
                    else:
                        return False
            except Exception:
                return False
        else:
            # The redirect already pointed at the file itself.
            self.url = r.url.split("?")[0]
            return True

    def find_folder(self, s: requests.Session):
        """Resolve self.url to a download_folder.php URL. Returns True on success."""
        try:
            form = self.soup.find("form")
            url = form["action"] + "?id=" + form.find("input")["value"]
            if url.find("download_folder.php") > 0:
                self.url = url
                return True
            else:
                return False
        except Exception:
            # No download form on the page: rebuild the download URL from the view URL.
            r = s.get(self.url, allow_redirects=True)
            if r.url.find("view.php?id=") > 0:
                url = r.url.replace("view.php", "download_folder.php")
                self.url = url
                return True
            else:
                return False


class MoodleCourseSection:
    def __init__(self, soup: bs4.element.Tag):
        self.soup = soup
        self.title = self._get_title()
        self.activities = self._get_activities()

    def __repr__(self):
        return f"{self.title}: {len(self.activities)} activities"

    def _get_title(self) -> str:
        title = self.soup.find("h3", {"class": "sectionname"}).text.strip()
        return unquote(title)[:min(len(title), 50)]

    def _get_activities(self) -> list[MoodleActivity]:
        activities_raw = self.soup.find_all("li", {"class": "activity"})
        return list(map(lambda x: MoodleActivity(x), activities_raw))

    def set_parent_dir(self, parent_dir: str, index: int):
        for act in self.activities:
            act.set_parent_dir(os.path.join(parent_dir, f"{index} - {self.title}"))


class MoodleCourse:
    def __init__(self, name_with_code: str, url: str):
        self.url = url
        self.code = unquote(name_with_code.split()[0])
        self.name = unquote(" ".join(name_with_code.split()[1:]).strip())
        self.soup = None
        self.sections = None

    def __repr__(self):
        return f"{self.code} {self.name}: {len(self.sections)} sections"

    def init(self, s: requests.Session):
        self.soup = self._get_soup(s)
        self.sections = self._get_sections()

    def _get_soup(self, s: requests.Session) -> BeautifulSoup:
        r = s.get(self.url)
        return BeautifulSoup(r.text, 'html.parser')

    def _get_sections(self) -> list[MoodleCourseSection]:
        sections_raw = self.soup.find_all("li", {"class": "course-section"})
        return list(map(lambda x: MoodleCourseSection(x), sections_raw))

    def set_parent_dir(self, parent_dir, year: str = None):
        if year is None:
            # Archive URLs embed the year, e.g. https://moodlearchive.epfl.ch/2019-2020/...
            year = self.url.split("/")[3]
        for i, sec in enumerate(self.sections):
            sec.set_parent_dir(os.path.join(parent_dir, f"{self.code} - {self.name}", year), i)

42
README.md Normal file

@@ -0,0 +1,42 @@
# MoodleScraper
A simple tool to download as many files from Moodle as possible.
This was designed to let me easily build an archive of my courses at EPFL. It is still in development and was not built with other Moodle instances in mind.
## Requirements
This script was written with Python 3.10.4 and the following packages:
| package | version |
|----------------|---------|
| beautifulsoup4 | 4.12.2 |
| requests | 2.31.0 |
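
If in doubt, you can check the installed versions against the table above (a quick sketch, not part of the scraper itself):

```python
# Print the installed versions of the two dependencies (requires Python 3.8+).
from importlib.metadata import version

for pkg in ("beautifulsoup4", "requests"):
    print(pkg, version(pkg))
```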
## Features
This script should be able to download all files directly linked in Moodle resources and folders. It is not able to download files from secondary sources.
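
Concretely, "directly linked" means the file URL that a resource page resolves to: the script follows each activity link and, if it lands on a `view.php` page instead of the file itself, takes the first `<a>` (or `<img>`) inside the page's `region-main` section. Below is a condensed sketch of that step; the helper name is just for illustration, and the full logic (including a frameset fallback) lives in `MoodleActivity.find_file` in `MoodleCourse.py`:

```python
import requests
from bs4 import BeautifulSoup

def resolve_resource_url(session: requests.Session, activity_url: str):
    """Follow a resource link and return a direct file URL, or None if none is found."""
    r = session.get(activity_url, allow_redirects=True)
    if "view.php?id=" not in r.url:
        # The redirect already pointed at the file itself.
        return r.url.split("?")[0]
    main = BeautifulSoup(r.text, "html.parser").find("section", {"id": "region-main"})
    if main is None:
        return None
    link = main.find("a")
    if link is not None:
        return link["href"]
    img = main.find("img")
    return img["src"] if img is not None else None
```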
## Usage
The script is simple to use: just provide a Moodle URL and authentication information like this:
```
python main.py -u USER -p PASSWORD -d /moodle -m https://moodlearchive.epfl.ch/2019-2020/
```
If the URL does not end with the academic year, you must specify the year explicitly:
```
python main.py -u USER -p PASSWORD -d /moodle -m https://moodle.epfl.ch/ -y 2023-2024
```
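
The reason is that, when `-y` is not given, the year is taken from the fourth slash-separated segment of the Moodle URL (see `MoodleCourse.set_parent_dir`), which only works for archive URLs that embed it:

```python
# How the default academic year is derived from the URL:
print("https://moodlearchive.epfl.ch/2019-2020/".split("/")[3])  # -> "2019-2020"
print("https://moodle.epfl.ch/".split("/")[3])                   # -> "" (empty, hence -y is required)
```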
For speed or storage reasons, you can tell the script to skip specific file types:
```
python main.py -u USER -p PASSWORD -d /moodle -m https://moodlearchive.epfl.ch/2019-2020/ -i mp4 -i mov
```
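
Extensions are matched against the last dot-separated part of the target filename (see `download_resource` in `main.py`), so pass them without a leading dot:

```python
# The check performed for each file before downloading:
file = "lecture01.mp4"
ignore = ["mp4", "mov"]               # the values passed with -i
print(file.split(".")[-1] in ignore)  # -> True, so this file is skipped
```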
| argument | short | example |
|--------------------|-------|-------------------------|
| --username | -u | username |
| --password | -p | superstrongpassword |
| --moodle_url | -m | https://moodle.epfl.ch/ |
| --academic_year | -y | 2023-2024 |
| --storage_dir | -d | /moodle |
| --ignore_extension | -i | mp4 |
## Planned
- [ ] Download submission files
- [ ] Create URL files for secondary URLs

118
main.py Normal file

@@ -0,0 +1,118 @@
import requests
from bs4 import BeautifulSoup
from MoodleCourse import MoodleCourse
import os
import pathvalidate
from urllib.parse import unquote


def get_moodle_auth_cookie(s: requests.Session, url: str) -> requests.Session:
    """Log in through EPFL's Tequila SSO and keep the session cookie."""
    s.get(f"{url}login/index.php")
    s.post("https://tequila.epfl.ch/cgi-bin/tequila/login", data={
        "requestkey": s.cookies["TequilaPHP"],
        "username": args.username,
        "password": args.password
    })
    return s


def get_courses(s: requests.Session, url: str) -> list[MoodleCourse]:
    """List the courses shown on the user's profile page."""
    r = s.get(f"{url}user/profile.php?showallcourses=1")
    soup = BeautifulSoup(r.text, 'html.parser')
    course_node = soup.find_all("section", {"class": "node_category card d-inline-block w-100 mb-3"})[1]
    course_list = course_node.find_all("a")
    course_names = list(map(lambda x: x.text.strip(), course_list))
    course_list = list(map(lambda x: x["href"], course_list))
    course_urls = list(map(lambda x: f"{url}course/view.php?id={x.split('&')[-2].split('=')[1]}", course_list))
    return list(map(lambda x: MoodleCourse(x[0], x[1]), zip(course_names, course_urls)))


def download_resource(s, activity):
    directory = os.path.join(activity.parent_dir, activity.name)
    directory = pathvalidate.sanitize_filepath(directory)
    file = os.path.join(directory, unquote(activity.url.split('/')[-1]))
    file = pathvalidate.sanitize_filepath(file)
    if args.ignore_extension is not None and str(file).split(".")[-1] in args.ignore_extension:
        return True
    if os.path.exists(file):
        return True
    if not os.path.exists(directory):
        os.makedirs(directory)
    try:
        # Download to a .part file first, then rename once the download is complete.
        with s.get(activity.url, stream=True) as r:
            with open(file + ".part", 'wb') as f:
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)
        os.rename(file + ".part", file)
        return True
    except Exception:
        if os.path.exists(file + ".part"):
            os.remove(file + ".part")
        return False


def download_folder(s, activity):
    directory = os.path.join(activity.parent_dir, activity.name)
    directory = pathvalidate.sanitize_filepath(directory)
    file = None
    try:
        with s.get(activity.url, stream=True) as r:
            # The archive's filename comes from the Content-Disposition header.
            file = os.path.join(directory, unquote(r.headers['Content-Disposition'].split('\'')[-1]))
            file = pathvalidate.sanitize_filepath(file)
            if os.path.exists(file):
                return True
            if not os.path.exists(directory):
                os.makedirs(directory)
            with open(file + ".part", 'wb') as f:
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)
        os.rename(file + ".part", file)
        return True
    except Exception:
        if file is not None and os.path.exists(file + ".part"):
            os.remove(file + ".part")
        return False


def download_year(target_url: str):
    s = requests.Session()
    s = get_moodle_auth_cookie(s, target_url)
    courses = get_courses(s, target_url)
    for course in courses:
        course.init(s)
        print(target_url, course.name)
        course.set_parent_dir(args.storage_dir, args.academic_year)
        for section in course.sections:
            for activity in section.activities:
                if activity.type == "resource":
                    if activity.find_file(s):
                        if not download_resource(s, activity):
                            print("Could not download: ", activity.url)
                    else:
                        print("Could not find: ", activity.url)
                elif activity.type == "folder":
                    if activity.find_folder(s):
                        if not download_folder(s, activity):
                            print("Could not download: ", activity.url)
                    else:
                        print("Could not find: ", activity.url)


if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser(description='Download resources and folders from a Moodle instance')
    parser.add_argument('--username', '-u')
    parser.add_argument('--password', '-p')
    parser.add_argument('--moodle_url', '--url', '-m', default='https://moodle.epfl.ch/')
    parser.add_argument('--academic_year', '--year', '-y', default=None)
    parser.add_argument('--storage_dir', '--dir', '-d', default='.')
    parser.add_argument('--ignore_extension', '-i', nargs='*')
    args = parser.parse_args()
    download_year(args.moodle_url)

BIN
requirement.txt Normal file

Binary file not shown.