# MoodleCourse.py
#
# Reconstructed from the whitespace-collapsed text of the patch "First commit"
# (3dc76ea34c5d62bb6230d6fa35e6f827d6e3da23, edufour, Fri, 19 Jan 2024), which
# adds MoodleCourse.py, README.md, main.py and requirement.txt.
# Commit note: the script is functional for resources and folders.
"""Object model for a scraped Moodle course: course -> sections -> activities."""

import os
from urllib.parse import unquote

import bs4
import requests
from bs4 import BeautifulSoup

# Maximum number of characters kept from activity names and section titles.
# Both end up as path components (see the ``set_parent_dir`` methods).
_MAX_NAME_LENGTH = 50


class MoodleActivity:
    """One activity entry (an ``<li class="activity ...">`` tag) of a section.

    ``name``, ``type`` and ``url`` are read from the tag at construction
    time, but only when the tag contains at least one link with an ``href``;
    otherwise they stay ``None``.
    """

    def __init__(self, soup):
        self.soup = soup
        self.name = None
        self.url = None
        self.type = None
        self.parent_dir = None
        self.all_urls = self._get_all_urls()
        if self.all_urls:
            self.init()

    def __repr__(self):
        return f"{self.type}: {self.name}"

    def init(self):
        """Fill ``name``, ``type`` and ``url`` from the underlying tag."""
        self.name = self._get_name()
        self.type = self._get_type()
        self.url = self._get_url()

    def _get_all_urls(self):
        """Return the ``href`` of every link inside the activity tag."""
        # href=True skips anchors without an href, which would otherwise
        # raise KeyError on a["href"].
        return [a["href"] for a in self.soup.find_all("a", href=True)]

    def _get_name(self):
        """Return the visible link text, URL-decoded and length-limited."""
        link = self.soup.find("a", href=True)
        text = link.text
        hidden = link.find("span", {"class": "accesshide"})
        hidden_text = hidden.text if hidden is not None else ""
        # Cut the link text where the "accesshide" span starts. An empty span
        # must be ignored: str.find("") is 0 and would erase the whole name.
        if hidden_text:
            text = text.partition(hidden_text)[0]
        return unquote(text.strip())[:_MAX_NAME_LENGTH]

    def _get_url(self):
        """Return the first link and remove it from ``all_urls``."""
        url = self.soup.find("a", href=True)["href"]
        if url in self.all_urls:
            self.all_urls.remove(url)
        return url

    def _get_type(self):
        """Return the third CSS class of the tag, or ``None`` if absent.

        main.py compares this value against ``"resource"`` and ``"folder"``.
        """
        classes = self.soup.get("class") or []
        return classes[2] if len(classes) > 2 else None

    def set_parent_dir(self, parent_dir: str):
        """Remember the directory this activity will be stored under."""
        self.parent_dir = parent_dir

    def find_file(self, s: requests.Session) -> bool:
        """Resolve ``self.url`` to the URL of the file behind a resource.

        Requests ``self.url`` (following redirects). If the final URL is not
        a ``view.php?id=`` page, it is taken as the file URL with its query
        string removed. Otherwise the page is searched, in order, for the
        first link in ``section#region-main``, the first image in that
        section, or (when the section is missing) the second ``<frame>``.

        Returns ``True`` and updates ``self.url`` when a target was found,
        ``False`` otherwise.
        """
        r = s.get(self.url, allow_redirects=True)
        if r.url.find("view.php?id=") <= 0:
            self.url = r.url.split("?")[0]
            return True
        try:
            soup = BeautifulSoup(r.text, "html.parser")
            main_region = soup.find("section", {"id": "region-main"})
            if main_region is None:
                frames = soup.find_all("frame")
                if len(frames) > 1:
                    self.url = frames[1]["src"]
                    return True
                return False
            link = main_region.find("a", href=True)
            if link is not None:
                self.url = link["href"]
                return True
            img = main_region.find("img", src=True)
            if img is not None:
                self.url = img["src"]
                return True
            return False
        except Exception:
            # Best effort: any unexpected page shape counts as "not found".
            return False

    def find_folder(self, s: requests.Session) -> bool:
        """Resolve ``self.url`` to the "download folder" URL of a folder.

        First tries the ``<form>`` embedded in the activity tag (its
        ``action`` plus the value of its first ``<input>`` as ``id``). When
        the tag has no usable form, requests ``self.url`` and rewrites a
        resulting ``view.php?id=`` URL to ``download_folder.php``.

        Returns ``True`` and updates ``self.url`` on success, else ``False``.
        """
        try:
            form = self.soup.find("form")
            url = form["action"] + "?id=" + form.find("input")["value"]
        except (TypeError, KeyError, AttributeError):
            # No form, no input, or a missing attribute: fall through to the
            # request-based lookup below.
            pass
        else:
            if url.find("download_folder.php") > 0:
                self.url = url
                return True
            return False

        r = s.get(self.url, allow_redirects=True)
        if r.url.find("view.php?id=") > 0:
            self.url = r.url.replace("view.php", "download_folder.php")
            return True
        return False


class MoodleCourseSection:
    """One section (an ``<li class="course-section">`` tag) of a course."""

    def __init__(self, soup: bs4.element.Tag):
        self.soup = soup
        self.title = self._get_title()
        self.activities = self._get_activities()

    def __repr__(self):
        return f"{self.title}: {len(self.activities)} activities"

    def _get_title(self) -> str:
        """Return the section heading, URL-decoded and length-limited.

        A section without an ``<h3 class="sectionname">`` gets an empty title
        instead of raising ``AttributeError``.
        """
        heading = self.soup.find("h3", {"class": "sectionname"})
        title = heading.text.strip() if heading is not None else ""
        return unquote(title)[:_MAX_NAME_LENGTH]

    def _get_activities(self) -> list[MoodleActivity]:
        """Wrap every ``<li class="activity">`` of the section."""
        return [
            MoodleActivity(tag)
            for tag in self.soup.find_all("li", {"class": "activity"})
        ]

    def set_parent_dir(self, parent_dir: str, index: int):
        """Point every activity at ``<parent_dir>/<index> - <title>``."""
        section_dir = os.path.join(parent_dir, f"{index} - {self.title}")
        for act in self.activities:
            act.set_parent_dir(section_dir)


class MoodleCourse:
    """A course identified by its display name and its course page URL.

    ``name_with_code`` is split on whitespace: the first word becomes
    ``code`` and the rest becomes ``name``. ``soup`` and ``sections`` stay
    ``None`` until :meth:`init` has been called with a session.
    """

    def __init__(self, name_with_code: str, url: str):
        self.url = url
        words = name_with_code.split()
        # An empty or blank name gives an empty code rather than IndexError.
        self.code = unquote(words[0]) if words else ""
        self.name = unquote(" ".join(words[1:]).strip())
        self.soup = None
        self.sections = None

    def __repr__(self):
        # sections is None until init() ran; report 0 instead of raising.
        count = len(self.sections) if self.sections is not None else 0
        return f"{self.code} {self.name}: {count} sections"

    def init(self, s: requests.Session):
        """Download the course page and parse its sections."""
        self.soup = self._get_soup(s)
        self.sections = self._get_sections()

    def _get_soup(self, s: requests.Session) -> BeautifulSoup:
        """Fetch ``self.url`` and return the parsed page."""
        r = s.get(self.url)
        return BeautifulSoup(r.text, "html.parser")

    def _get_sections(self) -> list[MoodleCourseSection]:
        """Wrap every ``<li class="course-section">`` of the course page."""
        return [
            MoodleCourseSection(tag)
            for tag in self.soup.find_all("li", {"class": "course-section"})
        ]

    def set_parent_dir(self, parent_dir, year: str = None):
        """Set the storage directory of every section and activity.

        Sections are stored under ``<parent_dir>/<code> - <name>/<year>``.
        When ``year`` is ``None`` it is taken from the fourth
        ``/``-separated piece of ``self.url``, i.e. the first path segment
        of a URL shaped like ``https://host/<year>/...``.

        :meth:`init` must have been called first, since this iterates
        ``self.sections``.
        """
        if year is None:
            year = self.url.split("/")[3]
        course_dir = os.path.join(parent_dir, f"{self.code} - {self.name}", year)
        for i, sec in enumerate(self.sections):
            sec.set_parent_dir(course_dir, i)


# ---------------------------------------------------------------------------
# README.md (first part, as added by the same patch)
# ---------------------------------------------------------------------------
# # MoodleScraper
# A simple tool to download as many files from Moodle as possible.
# This was designed for me to easily build an archive of my courses at EPFL.
# It is still in development and was not built for other Moodle
# installations.
#
# ## Requirements
# This script was written for Python 3.10.4 and uses the following packages:
#
# | package        | version |
# |----------------|---------|
# | beautifulsoup4 | 4.12.2  |
# | requests       | 2.31.0  |
#
# ## Features
# This script should be able to download all files that are directly linked
# from Moodle resources and folders. It is not able to download files from
# secondary sources.
#
# ## Usage
# The script is pretty simple to use.
# ---------------------------------------------------------------------------
# README.md (continued, as added by the patch "First commit")
# ---------------------------------------------------------------------------
# Just provide a Moodle link and authentication information like this:
#
#     python main.py -u USER -p PASSWORD -d /moodle -m https://moodlearchive.epfl.ch/2019-2020/
#
# If you provide a URL that doesn't end with the year, you must specify the
# year like this:
#
#     python main.py -u USER -p PASSWORD -d /moodle -m https://moodle.epfl.ch/ -y 2023-2024
#
# For speed or storage reasons you can tell the script not to download
# specific kinds of files:
#
#     python main.py -u USER -p PASSWORD -d /moodle -m https://moodlearchive.epfl.ch/2019-2020/ -i mp4 -i mov
#
# | argument           | short | example                 |
# |--------------------|-------|-------------------------|
# | --username         | -u    | username                |
# | --password         | -p    | superstrongpassword     |
# | --moodle_url       | -m    | https://moodle.epfl.ch/ |
# | --academic_year    | -y    | 2023-2024               |
# | --storage_dir      | -d    | /moodle                 |
# | --ignore_extension | -i    | mp4                     |
#
# ## Planned
# - [ ] Download submission files
# - [ ] Create url files for secondary urls
# ---------------------------------------------------------------------------
# main.py
"""Command-line entry point: log in to Moodle and download course files."""

import argparse
import getpass
import os
from urllib.parse import unquote

import pathvalidate
import requests
from bs4 import BeautifulSoup

from MoodleCourse import MoodleCourse

TEQUILA_LOGIN_URL = "https://tequila.epfl.ch/cgi-bin/tequila/login"
_CHUNK_SIZE = 8192

# Parsed command-line arguments. Set by main(); the functions below read
# ``username``, ``password``, ``storage_dir``, ``academic_year`` and
# ``ignore_extension`` from it.
args = None


def _base_url(url: str) -> str:
    """Return *url* with exactly one trailing slash, ready for appending paths."""
    return url.rstrip("/") + "/"


def get_moodle_auth_cookie(s: requests.Session, url: str) -> requests.Session:
    """Log the session in through Tequila and return it.

    Requests ``<url>login/index.php`` so that the session receives the
    ``TequilaPHP`` cookie, then posts that cookie value together with
    ``args.username`` and ``args.password`` to the Tequila login endpoint.

    Raises:
        RuntimeError: if the login page did not set a ``TequilaPHP`` cookie.
    """
    s.get(f"{_base_url(url)}login/index.php")
    request_key = s.cookies.get("TequilaPHP")
    if request_key is None:
        raise RuntimeError(
            "The Moodle login page did not set a TequilaPHP cookie; "
            "cannot authenticate."
        )
    s.post(TEQUILA_LOGIN_URL, data={
        "requestkey": request_key,
        "username": args.username,
        "password": args.password,
    })
    return s


def _course_id_from_profile_link(href: str):
    """Return the value of the second-to-last ``&`` parameter of *href*.

    That parameter is where the profile page's course links carry the course
    id. Returns ``None`` when the link does not have that shape.
    """
    params = href.split("&")
    if len(params) < 2:
        return None
    pieces = params[-2].split("=")
    if len(pieces) < 2 or not pieces[1]:
        return None
    return pieces[1]


def get_courses(s: requests.Session, url: str) -> list[MoodleCourse]:
    """Return the courses listed on the logged-in user's profile page.

    The course links are read from the second
    ``section.node_category.card...`` element of
    ``<url>user/profile.php?showallcourses=1``. Links from which no course
    id can be read are reported and skipped.

    Raises:
        RuntimeError: if the page has fewer than two such sections.
    """
    base = _base_url(url)
    r = s.get(f"{base}user/profile.php?showallcourses=1")
    soup = BeautifulSoup(r.text, "html.parser")
    nodes = soup.find_all(
        "section", {"class": "node_category card d-inline-block w-100 mb-3"}
    )
    if len(nodes) < 2:
        raise RuntimeError(
            "Could not find the course list on the profile page "
            "(possibly the login failed)."
        )
    courses = []
    for link in nodes[1].find_all("a", href=True):
        course_id = _course_id_from_profile_link(link["href"])
        if course_id is None:
            print("Could not read a course id from: ", link["href"])
            continue
        courses.append(
            MoodleCourse(link.text.strip(), f"{base}course/view.php?id={course_id}")
        )
    return courses


def _is_ignored(file: str) -> bool:
    """Tell whether *file* has one of the extensions given with ``-i``.

    The comparison ignores case and a leading dot, so ``-i mp4`` and
    ``-i .MP4`` are equivalent.
    """
    ignored = getattr(args, "ignore_extension", None)
    if not ignored:
        return False
    extension = os.path.splitext(file)[1].lstrip(".").casefold()
    if not extension:
        return False
    return extension in {ext.lstrip(".").casefold() for ext in ignored}


def _save_response(r, file: str) -> None:
    """Stream the body of *r* into *file* through a temporary ``.part`` file.

    The ``.part`` file is moved over *file* only after the whole body was
    written, and is removed if anything goes wrong.
    """
    part = file + ".part"
    try:
        with open(part, "wb") as f:
            for chunk in r.iter_content(chunk_size=_CHUNK_SIZE):
                f.write(chunk)
        os.replace(part, file)
    finally:
        if os.path.exists(part):
            os.remove(part)


def download_resource(s, activity):
    """Download the file behind a resource activity.

    The file is stored as ``<parent_dir>/<activity name>/<last URL segment>``
    (sanitized with pathvalidate).

    Returns:
        ``True`` when the file was downloaded, already exists, or has an
        ignored extension; ``False`` when the request or the write failed.
    """
    directory = pathvalidate.sanitize_filepath(
        os.path.join(activity.parent_dir, activity.name)
    )
    file = pathvalidate.sanitize_filepath(
        os.path.join(directory, unquote(activity.url.split("/")[-1]))
    )
    if _is_ignored(file):
        return True
    if os.path.exists(file):
        return True
    os.makedirs(directory, exist_ok=True)
    try:
        with s.get(activity.url, stream=True) as r:
            # Without this an HTTP error page would be saved as the file.
            r.raise_for_status()
            _save_response(r, file)
    except (requests.RequestException, OSError):
        return False
    # The previous "finally: return True" overrode the failure result above
    # and swallowed every exception, so failures were never reported.
    return True


def download_folder(s, activity):
    """Download the archive of a folder activity.

    The file name is the part after the last ``'`` of the response's
    ``Content-Disposition`` header, URL-decoded; the file is stored in
    ``<parent_dir>/<activity name>/`` (sanitized with pathvalidate).

    Returns:
        ``True`` when the archive was downloaded or already exists;
        ``False`` when the request failed, the header is missing, or the
        write failed.
    """
    directory = pathvalidate.sanitize_filepath(
        os.path.join(activity.parent_dir, activity.name)
    )
    try:
        with s.get(activity.url, stream=True) as r:
            r.raise_for_status()
            disposition = r.headers.get("Content-Disposition")
            if not disposition:
                # No file name to store the download under.
                return False
            file = pathvalidate.sanitize_filepath(
                os.path.join(directory, unquote(disposition.split("'")[-1]))
            )
            if os.path.exists(file):
                return True
            os.makedirs(directory, exist_ok=True)
            _save_response(r, file)
    except (requests.RequestException, OSError):
        return False
    return True


def download_year(target_url: str):
    """Log in at *target_url* and download every resource and folder.

    Files go below ``args.storage_dir``; ``args.academic_year`` is passed on
    to ``MoodleCourse.set_parent_dir``. Activities that cannot be resolved or
    downloaded are reported on stdout and skipped.
    """
    target_url = _base_url(target_url)
    s = requests.Session()
    s = get_moodle_auth_cookie(s, target_url)
    courses = get_courses(s, target_url)
    for course in courses:
        course.init(s)
        print(target_url, course.name)
        course.set_parent_dir(args.storage_dir, args.academic_year)
        for section in course.sections:
            for activity in section.activities:
                if activity.type == "resource":
                    found = activity.find_file(s)
                    download = download_resource
                elif activity.type == "folder":
                    found = activity.find_folder(s)
                    download = download_folder
                else:
                    continue
                if not found:
                    print("Could not find: ", activity.url)
                elif not download(s, activity):
                    print("Could not download: ", activity.url)


def main():
    """Parse the command line, store it in the module-level ``args`` and run."""
    global args

    parser = argparse.ArgumentParser(description='Moodle to download')
    parser.add_argument('--username', '-u')
    parser.add_argument('--password', '-p')
    parser.add_argument('--moodle_url', '--url', '-m', default='https://moodle.epfl.ch/')
    parser.add_argument('--academic_year', '--year', '-y', default=None)
    parser.add_argument('--storage_dir', '--dir', '-d', default='.')
    # action='extend' makes repeated flags accumulate, so that both
    # "-i mp4 mov" and "-i mp4 -i mov" ignore both extensions.
    parser.add_argument('--ignore_extension', '-i', nargs='*', action='extend')
    args = parser.parse_args()
    # Prompt for missing credentials, so that the password does not have to
    # be given on the command line.
    if args.username is None:
        args.username = input("Username: ")
    if args.password is None:
        args.password = getpass.getpass("Password: ")
    download_year(args.moodle_url)


if __name__ == "__main__":
    main()


# ---------------------------------------------------------------------------
# requirement.txt
# ---------------------------------------------------------------------------
# The patch also adds requirement.txt as a "GIT binary patch"
# (literal 1592, index 0000000..c92b709). The base85 payload is not
# reproduced here: it is not source text, and in the collapsed dump its
# lines no longer have the length a git binary hunk requires, so it appears
# to be damaged.