From 8e10323ac3664a1ddb6d9393a9651c7cfb9a44ac Mon Sep 17 00:00:00 2001 From: edufour Date: Sat, 27 Jan 2024 01:02:41 +0100 Subject: [PATCH] Changing moodle activity handling --- MoodleClasses/MoodleActivity.py | 32 ++++++ MoodleClasses/MoodleAssign.py | 36 +++++++ MoodleClasses/MoodleCourse.py | 35 +++++++ MoodleClasses/MoodleCourseSection.py | 46 ++++++++ MoodleClasses/MoodleFile.py | 56 ++++++++++ MoodleClasses/MoodleFolder.py | 47 +++++++++ MoodleClasses/MoodleResource.py | 61 +++++++++++ MoodleCourse.py | 151 --------------------------- main.py | 103 ++---------------- 9 files changed, 320 insertions(+), 247 deletions(-) create mode 100644 MoodleClasses/MoodleActivity.py create mode 100644 MoodleClasses/MoodleAssign.py create mode 100644 MoodleClasses/MoodleCourse.py create mode 100644 MoodleClasses/MoodleCourseSection.py create mode 100644 MoodleClasses/MoodleFile.py create mode 100644 MoodleClasses/MoodleFolder.py create mode 100644 MoodleClasses/MoodleResource.py delete mode 100644 MoodleCourse.py diff --git a/MoodleClasses/MoodleActivity.py b/MoodleClasses/MoodleActivity.py new file mode 100644 index 0000000..015ce88 --- /dev/null +++ b/MoodleClasses/MoodleActivity.py @@ -0,0 +1,32 @@ +from urllib.parse import unquote +import bs4 +import os + + +class MoodleActivity: + def __init__(self, soup: bs4.element.Tag): + self.soup = soup + self.all_urls = self._get_all_urls() + self.name = self._get_name() + self.parent_dir = None + + def __repr__(self): + return f"{self.parent_dir}/{self.name}" + + def _get_all_urls(self) -> list[str]: + return list(map(lambda x: x["href"], self.soup.find_all("a"))) + + def _get_name(self) -> str: + link = self.soup.find("a") + if link is None: + return self.soup.text.strip().split("\n")[0].strip() + text = link.text + toremove = link.find("span", {"class": "accesshide"}) + if toremove is None: + name = text.strip() + else: + name = text[:text.find(toremove.text)].strip() + return unquote(name)[:min(len(name), 50)] + + def set_parent_dir(self, parent_dir: os.path) -> None: + self.parent_dir = parent_dir diff --git a/MoodleClasses/MoodleAssign.py b/MoodleClasses/MoodleAssign.py new file mode 100644 index 0000000..66ea373 --- /dev/null +++ b/MoodleClasses/MoodleAssign.py @@ -0,0 +1,36 @@ +from MoodleClasses.MoodleActivity import MoodleActivity +from MoodleClasses.MoodleFile import MoodleFile +from bs4 import BeautifulSoup +from urllib.parse import unquote +import requests + + +class MoodleAssign(MoodleActivity): + def __init__(self, soup: BeautifulSoup): + super().__init__(soup) + self.url = self._get_url() + self.parent_dir = None + self.files = [] + + def _get_url(self) -> str: + url = self.soup.find("a")["href"] + if url in self.all_urls: + self.all_urls.remove(url) + return url + + def find(self, s: requests.Session) -> bool: + r = s.get(self.url) + soup = BeautifulSoup(r.text, 'html.parser') + for candidate in soup.find_all("div", {"class": "fileuploadsubmission"}): + link = candidate.find("a") + url = link["href"].split('?')[0] + filename = unquote(link.text) + self.files.append(MoodleFile(self, url)) + self.files[-1]._get_filepath(filename) + return True + + def download(self, s: requests.Session, ignore_extension: list[str]) -> bool: + result = True + for file in self.files: + result = result and file.download(s, ignore_extension) + return result diff --git a/MoodleClasses/MoodleCourse.py b/MoodleClasses/MoodleCourse.py new file mode 100644 index 0000000..9eb7b16 --- /dev/null +++ b/MoodleClasses/MoodleCourse.py @@ -0,0 +1,35 @@ +from MoodleClasses.MoodleCourseSection import MoodleCourseSection +from bs4 import BeautifulSoup +from urllib.parse import unquote +import requests +import os + + +class MoodleCourse: + def __init__(self, name_with_code: str, url: str): + self.url = url + self.code = unquote(name_with_code.split()[0]) + self.name = unquote(" ".join(name_with_code.split()[1:]).strip()) + self.soup = None + self.sections = None + + def __repr__(self): + return f"{self.code} {self.name}: {len(self.sections)} sections" + + def init(self, s: requests.Session) -> None: + self.soup = self._get_soup(s) + self.sections = self._get_sections() + + def _get_soup(self, s: requests.Session) -> BeautifulSoup: + r = s.get(self.url) + return BeautifulSoup(r.text, 'html.parser') + + def _get_sections(self) -> list[MoodleCourseSection]: + sections_raw = self.soup.find_all("li", {"class": "course-section"}) + return list(map(lambda x: MoodleCourseSection(x), sections_raw)) + + def set_parent_dir(self, parent_dir: [str, os.path], year: str = None) -> None: + if year is None: + year = self.url.split("/")[3] + for i, sec in enumerate(self.sections): + sec.set_parent_dir(os.path.join(parent_dir, f"{self.code} - {self.name}", year), i) diff --git a/MoodleClasses/MoodleCourseSection.py b/MoodleClasses/MoodleCourseSection.py new file mode 100644 index 0000000..89554e2 --- /dev/null +++ b/MoodleClasses/MoodleCourseSection.py @@ -0,0 +1,46 @@ +from urllib.parse import unquote +from MoodleClasses.MoodleActivity import MoodleActivity +from MoodleClasses.MoodleResource import MoodleResource +from MoodleClasses.MoodleFolder import MoodleFolder +from MoodleClasses.MoodleAssign import MoodleAssign +from bs4 import BeautifulSoup +from typing import Type +import bs4 +import os + + +class MoodleCourseSection: + def __init__(self, soup: bs4.element.Tag): + self.soup = soup + self.title = self._get_title() + self.activities = self._get_activities() + + def __repr__(self): + return f"{self.title}: {len(self.activities)} activities" + + def _get_title(self) -> str: + title = self.soup.find("h3", {"class": "sectionname"}).text.strip() + return unquote(title)[:min(len(title), 50)] + + def _get_activities(self) -> list[Type[MoodleActivity]]: + activities_raw = self.soup.find_all("li", {"class": "activity"}) + activities = [] + for instance, soup in map(lambda x: (self._classify_activity(x), x), activities_raw): + if instance is not None: + activities.append(instance(soup)) + return activities + + @staticmethod + def _classify_activity(soup: BeautifulSoup) -> [None, MoodleActivity]: + activitytype = soup["class"][2] + if activitytype == "resource": + return MoodleResource + if activitytype == "folder": + return MoodleFolder + if activitytype == "assign": + return MoodleAssign + return None + + def set_parent_dir(self, parent_dir: os.path, index: int) -> None: + for act in self.activities: + act.set_parent_dir(os.path.join(parent_dir, f"{index} - {self.title}")) diff --git a/MoodleClasses/MoodleFile.py b/MoodleClasses/MoodleFile.py new file mode 100644 index 0000000..2f73495 --- /dev/null +++ b/MoodleClasses/MoodleFile.py @@ -0,0 +1,56 @@ +from MoodleClasses.MoodleActivity import MoodleActivity +from urllib.parse import unquote +import pathvalidate +import requests +import os + + +class MoodleFile: + def __init__(self, parent_activity: MoodleActivity, url: str): + self.parent_activity = parent_activity + self.url = url + self.directory = self._get_directory() + self.filepath = self._get_filepath() + self.extension = str(self.filepath).split(".")[-1] + + def _get_directory(self) -> os.path: + directory = os.path.join(self.parent_activity.parent_dir, self.parent_activity.name) + return pathvalidate.sanitize_filepath(directory) + + def _get_filepath(self, filename: str=None) -> os.path: + if filename is None: + file = os.path.join(self.directory, unquote(self.url.split('/')[-1])) + else: + file = os.path.join(self.directory, filename) + return pathvalidate.sanitize_filepath(file) + + def _decide_download(self, ignore_extension: list[str]) -> bool: + if ignore_extension is not None and self.extension in ignore_extension: + return False + if os.path.exists(self.filepath): + return False + return True + + def _download(self, s: requests.Session) -> bool: + if not os.path.exists(self.directory): + os.makedirs(self.directory) + try: + with s.get(self.url, stream=True) as r: + with open(self.filepath+".part", 'wb') as f: + for chunk in r.iter_content(chunk_size=8192): + f.write(chunk) + os.rename(self.filepath+".part", self.filepath) + if os.path.exists(self.filepath + ".part"): + os.remove(self.filepath + ".part") + except Exception as e: + if os.path.exists(self.filepath+".part"): + os.remove(self.filepath+".part") + return False + finally: + return True + + def download(self, s: requests.Session, ignore_extension: list[str]) -> bool: + if self._decide_download(ignore_extension): + return self._download(s) + else: + return True diff --git a/MoodleClasses/MoodleFolder.py b/MoodleClasses/MoodleFolder.py new file mode 100644 index 0000000..84906c9 --- /dev/null +++ b/MoodleClasses/MoodleFolder.py @@ -0,0 +1,47 @@ +from MoodleClasses.MoodleActivity import MoodleActivity +from MoodleClasses.MoodleFile import MoodleFile +from bs4 import BeautifulSoup +from urllib.parse import unquote +import requests + + +class MoodleFolder(MoodleActivity): + def __init__(self, soup: BeautifulSoup): + super().__init__(soup) + self.parent_dir = None + self.file = None + + def _get_url(self) -> str: + url = self.soup.find("a")["href"] + if url in self.all_urls: + self.all_urls.remove(url) + return url + + def find(self, s: requests.Session) -> bool: + try: + form = self.soup.find("form") + url = form["action"] + "?id=" + form.find("input")["value"] + if not url.find("download_folder.php") > 0: + return False + else: + self.file = MoodleFile(self, url) + except: + r = s.get(self._get_url(), allow_redirects=True) + if not r.url.find("view.php?id=") > 0: + return False + else: + url = r.url.replace("view.php", "download_folder.php") + self.file = MoodleFile(self, url) + with s.get(url, allow_redirects=True, stream=True) as rh: + if 'Content-Disposition' not in rh.headers: + return False + else: + self.file.filepath = self.file._get_filepath(filename=unquote(rh.headers['Content-Disposition'].split('\'')[-1])) + return True + + def download(self, s: requests.Session, ignore_extension: list[str]) -> bool: + return self.file.download(s, ignore_extension) + + + + diff --git a/MoodleClasses/MoodleResource.py b/MoodleClasses/MoodleResource.py new file mode 100644 index 0000000..4a6e67d --- /dev/null +++ b/MoodleClasses/MoodleResource.py @@ -0,0 +1,61 @@ +from MoodleClasses.MoodleActivity import MoodleActivity +from MoodleClasses.MoodleFile import MoodleFile +from bs4 import BeautifulSoup +import requests + + +class MoodleResource(MoodleActivity): + def __init__(self, soup: BeautifulSoup): + super().__init__(soup) + self.url = self._get_url() + self.parent_dir = None + self.file = None + + def _get_url(self) -> str: + link = self.soup.find("a") + if link is None: + return None + url = link["href"] + if url in self.all_urls: + self.all_urls.remove(url) + return url + + def find(self, s: requests.Session) -> bool: + if self.url is None: + return False + r = s.get(self.url, allow_redirects=True) + if r.url.find("view.php?id=") > 0: + try: + soup = BeautifulSoup(r.text, 'html.parser') + main_region = soup.find("section", {"id": "region-main"}) + if main_region is not None: + link = main_region.find("a") + if link is not None: + self.file = MoodleFile(self, link["href"]) + return True + else: + img = main_region.find("img") + if img is not None: + self.file = MoodleFile(self, img["src"]) + return True + else: + return False + else: + frames = soup.find_all("frame") + if len(frames) > 1: + self.file = MoodleFile(self, frames[1]["src"]) + return True + else: + return False + except Exception as e: + return False + else: + self.file = MoodleFile(self, r.url.split("?")[0]) + return True + + def download(self, s: requests.Session, ignore_extension: list[str]) -> bool: + return self.file.download(s, ignore_extension) + + + + diff --git a/MoodleCourse.py b/MoodleCourse.py deleted file mode 100644 index 72fa082..0000000 --- a/MoodleCourse.py +++ /dev/null @@ -1,151 +0,0 @@ -import requests -from bs4 import BeautifulSoup -import bs4 -import os -from urllib.parse import unquote - - -class MoodleActivity: - def __init__(self, soup): - self.soup = soup - self.name = None - self.url = None - self.type = None - self.parent_dir = None - self.all_urls = self._get_all_urls() - if self.all_urls: - self.init() - - def __repr__(self): - return f"{self.type}: {self.name}" - - def init(self): - self.name = self._get_name() - self.type = self._get_type() - self.url = self._get_url() - - def _get_all_urls(self): - return list(map(lambda x: x["href"], self.soup.find_all("a"))) - - def _get_name(self): - link = self.soup.find("a") - text = link.text - toremove = link.find("span", {"class": "accesshide"}) - if toremove is None: - name = text.strip() - else: - name = text[:text.find(toremove.text)].strip() - return unquote(name)[:min(len(name), 50)] - - def _get_url(self): - url = self.soup.find("a")["href"] - if url in self.all_urls: - self.all_urls.remove(url) - return url - - def _get_type(self): - return self.soup["class"][2] - - def set_parent_dir(self, parent_dir: os.path): - self.parent_dir = parent_dir - - def find_file(self, s: requests.Session) -> bool: - r = s.get(self.url, allow_redirects=True) - if r.url.find("view.php?id=")>0: - try: - soup = BeautifulSoup(r.text, 'html.parser') - main_region = soup.find("section", {"id": "region-main"}) - if main_region is not None: - link = main_region.find("a") - if link is not None: - self.url = link["href"] - return True - else: - img = main_region.find("img") - if img is not None: - self.url = img["src"] - return True - else: - return False - else: - frames = soup.find_all("frame") - if len(frames) > 1: - self.url = frames[1]["src"] - return True - else: - return False - except Exception as e: - return False - else: - self.url = r.url.split("?")[0] - return True - - def find_folder(self, s: requests.Session): - try: - form = self.soup.find("form") - url = form["action"] + "?id=" + form.find("input")["value"] - if url.find("download_folder.php") > 0: - self.url = url - return True - else: - return False - except: - r = s.get(self.url, allow_redirects=True) - if r.url.find("view.php?id=") > 0: - url = r.url.replace("view.php", "download_folder.php") - self.url = url - return True - else: - return False - - -class MoodleCourseSection: - def __init__(self, soup: bs4.element.Tag): - self.soup = soup - self.title = self._get_title() - self.activities = self._get_activities() - - def __repr__(self): - return f"{self.title}: {len(self.activities)} activities" - - def _get_title(self) -> str: - title = self.soup.find("h3", {"class": "sectionname"}).text.strip() - return unquote(title)[:min(len(title), 50)] - - def _get_activities(self) -> list[MoodleActivity]: - activities_raw = self.soup.find_all("li", {"class": "activity"}) - return list(map(lambda x: MoodleActivity(x), activities_raw)) - - def set_parent_dir(self, parent_dir: os.path, index: int): - for act in self.activities: - act.set_parent_dir(os.path.join(parent_dir, f"{index} - {self.title}")) - - -class MoodleCourse: - def __init__(self, name_with_code: str, url: str): - self.url = url - self.code = unquote(name_with_code.split()[0]) - self.name = unquote(" ".join(name_with_code.split()[1:]).strip()) - self.soup = None - self.sections = None - - def __repr__(self): - return f"{self.code} {self.name}: {len(self.sections)} sections" - - def init(self, s: requests.Session): - self.soup = self._get_soup(s) - self.sections = self._get_sections() - - def _get_soup(self, s: requests.Session) -> BeautifulSoup: - r = s.get(self.url) - return BeautifulSoup(r.text, 'html.parser') - - def _get_sections(self) -> list[MoodleCourseSection]: - sections_raw = self.soup.find_all("li", {"class": "course-section"}) - return list(map(lambda x: MoodleCourseSection(x), sections_raw)) - - def set_parent_dir(self, parent_dir, year: str = None): - if year is None: - year = self.url.split("/")[3] - for i, sec in enumerate(self.sections): - sec.set_parent_dir(os.path.join(parent_dir, f"{self.code} - {self.name}", year), i) diff --git a/main.py b/main.py index a734d6e..f82a739 100644 --- a/main.py +++ b/main.py @@ -1,9 +1,6 @@ import requests from bs4 import BeautifulSoup -from MoodleCourse import MoodleCourse -import os -import pathvalidate -from urllib.parse import unquote +from MoodleClasses.MoodleCourse import MoodleCourse def get_moodle_auth_cookie(s: requests.Session, url: str) -> requests.Session: @@ -27,84 +24,7 @@ def get_courses(s: requests.Session, url: str) -> list[MoodleCourse]: return list(map(lambda x: MoodleCourse(x[0], x[1]), zip(course_names, course_urls))) -def download_resource(s, activity): - directory = os.path.join(activity.parent_dir, activity.name) - directory = pathvalidate.sanitize_filepath(directory) - file = os.path.join(directory, unquote(activity.url.split('/')[-1])) - file = pathvalidate.sanitize_filepath(file) - if args.ignore_extension is not None and str(file).split(".")[-1] in args.ignore_extension: - return True - if os.path.exists(os.path.join(file)): - return True - if not os.path.exists(directory): - os.makedirs(directory) - try: - with s.get(activity.url, stream=True) as r: - with open(file+".part", 'wb') as f: - for chunk in r.iter_content(chunk_size=8192): - f.write(chunk) - os.rename(file+".part", file) - if os.path.exists(file + ".part"): - os.remove(file + ".part") - except: - if os.path.exists(file+".part"): - os.remove(file+".part") - return False - finally: - return True - - -def download_folder(s, activity): - directory = os.path.join(activity.parent_dir, activity.name) - directory = pathvalidate.sanitize_filepath(directory) - try: - with s.get(activity.url, stream=True) as r: - file = os.path.join(directory, unquote(r.headers['Content-Disposition'].split('\'')[-1])) - file = pathvalidate.sanitize_filepath(file) - if os.path.exists(os.path.join(file)): - return True - if not os.path.exists(directory): - os.makedirs(directory) - with open(file + ".part", 'wb') as f: - for chunk in r.iter_content(chunk_size=8192): - f.write(chunk) - os.rename(file + ".part", file) - if os.path.exists(file + ".part"): - os.remove(file + ".part") - except: - if os.path.exists(file+".part"): - os.remove(file+".part") - return False - finally: - return True - - -def download_assign(s, activity): - r = s.get(activity.url) - soup = BeautifulSoup(r.text, 'html.parser') - for candidate in soup.find_all("div", {"class": "fileuploadsubmission"}): - link = candidate.find("a") - url = link["href"].split('?')[0] - with s.get(url, stream=True) as r: - directory = os.path.join(activity.parent_dir, activity.name) - directory = pathvalidate.sanitize_filepath(directory) - file = os.path.join(directory, unquote(link.text)) - file = pathvalidate.sanitize_filepath(file) - if not os.path.exists(directory): - os.makedirs(directory) - if os.path.exists(os.path.join(file)): - return True - with open(file + ".part", 'wb') as f: - for chunk in r.iter_content(chunk_size=8192): - f.write(chunk) - os.rename(file + ".part", file) - if os.path.exists(file + ".part"): - os.remove(file + ".part") - return True - - - -def download_year(target_url: str): +def download_year(target_url: str) -> None: s = requests.Session() s = get_moodle_auth_cookie(s, target_url) courses = get_courses(s, target_url) @@ -114,20 +34,11 @@ def download_year(target_url: str): course.set_parent_dir(args.storage_dir, args.academic_year) for section in course.sections: for activity in section.activities: - if activity.type == "resource": - if activity.find_file(s): - if not download_resource(s, activity): - print("Could not download: ", activity.url) - else: - print("Could not find: ", activity.url) - elif activity.type == "folder": - if activity.find_folder(s): - if not download_folder(s, activity): - print("Could not download: ", activity.url) - else: - print("Could not find: ", activity.url) - elif activity.type == "assign": - download_assign(s, activity) + if activity.find(s): + if not activity.download(s, args.ignore_extension): + print(f"Could not download: {activity}") + else: + print(f"Could not find: {activity}") if __name__ == "__main__":