First commit

Script is functional for resources and folders.
2024-01-19 19:04:20 +00:00
commit 3dc76ea34c
4 changed files with 311 additions and 0 deletions
--- a/MoodleCourse.py
+++ b/MoodleCourse.py
@@ -0,0 +1,151 @@
 import requests
 from bs4 import BeautifulSoup
 import bs4
 import os
 from urllib.parse import unquote
 class MoodleActivity:
    """One activity entry of a Moodle course section.

    Wraps the BeautifulSoup tag of an activity list item and extracts its
    display name, type and primary link. When the tag contains no link at
    all, ``name``, ``url`` and ``type`` stay ``None``.
    """
    # Upper bound on the extracted name length.
    _MAX_NAME_LENGTH = 50
    def __init__(self, soup):
        """Store *soup* and parse it if it contains at least one link."""
        self.soup = soup
        self.name = None
        self.url = None
        self.type = None
        self.parent_dir = None
        self.all_urls = self._get_all_urls()
        if self.all_urls:
            self.init()
    def __repr__(self):
        return f"{self.type}: {self.name}"
    def init(self):
        """Fill in name, type and primary URL from the wrapped tag."""
        self.name = self._get_name()
        self.type = self._get_type()
        self.url = self._get_url()
    def _first_link(self):
        """Return the first anchor that carries an ``href`` attribute."""
        return self.soup.find("a", href=True)
    def _get_all_urls(self):
        """Return the ``href`` of every anchor inside the tag."""
        # ``href=True`` skips anchors without the attribute; indexing those
        # with ["href"] would raise KeyError and abort construction.
        return [anchor["href"] for anchor in self.soup.find_all("a", href=True)]
    def _get_name(self):
        """Return the link text without the hidden accessibility suffix."""
        link = self._first_link()
        text = link.text
        hidden = link.find("span", {"class": "accesshide"})
        if hidden is None:
            name = text.strip()
        else:
            # Cut at the LAST occurrence of the hidden text: cutting at the
            # first one truncates names that contain the same words.
            # NOTE(review): assumes the hidden span closes the link text.
            name = text[:text.rfind(hidden.text)].strip()
        # unquote() never lengthens a string, so a plain slice is enough.
        return unquote(name)[:self._MAX_NAME_LENGTH]
    def _get_url(self):
        """Return the primary link and drop it from ``all_urls``."""
        url = self._first_link()["href"]
        if url in self.all_urls:
            self.all_urls.remove(url)
        return url
    def _get_type(self):
        """Return the third entry of the tag's ``class`` attribute.

        Presumably the Moodle module type (e.g. ``resource`` or ``folder``);
        this depends on the class order of the page markup.
        """
        return self.soup["class"][2]
    def set_parent_dir(self, parent_dir: str):
        """Remember the directory this activity will be stored under."""
        self.parent_dir = parent_dir
    def find_file(self, s: requests.Session) -> bool:
        """Resolve ``self.url`` to the URL of the actual file.

        Fetches the current URL with session *s*. If the request does not end
        on a ``view.php?id=`` page, the final URL (without query string) is
        taken as the file. Otherwise the page is searched for, in order: the
        first link in ``section#region-main``, the first image there, or the
        second ``<frame>`` of the document.

        Returns True when ``self.url`` was updated, False when nothing was found.
        """
        r = s.get(self.url, allow_redirects=True)
        if "view.php?id=" not in r.url:
            self.url = r.url.split("?")[0]
            return True
        try:
            soup = BeautifulSoup(r.text, 'html.parser')
            main_region = soup.find("section", {"id": "region-main"})
            if main_region is None:
                frames = soup.find_all("frame")
                if len(frames) > 1:
                    self.url = frames[1]["src"]
                    return True
                return False
            link = main_region.find("a")
            if link is not None:
                self.url = link["href"]
                return True
            img = main_region.find("img")
            if img is not None:
                self.url = img["src"]
                return True
            return False
        except Exception:
            # Best effort: a page that cannot be parsed or lacks the expected
            # attributes counts as "file not found".
            return False
    def find_folder(self, s: requests.Session) -> bool:
        """Resolve ``self.url`` to the folder's ``download_folder.php`` URL.

        First tries the form embedded in the activity tag; if there is no
        usable form, fetches the activity page with session *s* and rewrites
        its ``view.php`` URL.

        Returns True when ``self.url`` was updated, False otherwise.
        """
        try:
            form = self.soup.find("form")
            url = form["action"] + "?id=" + form.find("input")["value"]
        except (AttributeError, KeyError, TypeError):
            # No form, no input, or a missing attribute: fall back to the page.
            r = s.get(self.url, allow_redirects=True)
            if "view.php?id=" in r.url:
                self.url = r.url.replace("view.php", "download_folder.php")
                return True
            return False
        if "download_folder.php" in url:
            self.url = url
            return True
        return False
 class MoodleCourseSection:
    """A section of a course page together with the activities it lists."""
    def __init__(self, soup: bs4.element.Tag):
        """Keep the section tag and parse its title and activities."""
        self.soup = soup
        self.title = self._get_title()
        self.activities = self._get_activities()
    def __repr__(self):
        count = len(self.activities)
        return f"{self.title}: {count} activities"
    def _get_title(self) -> str:
        """Return the unquoted text of the ``h3.sectionname`` heading, capped at 50 characters."""
        heading = self.soup.find("h3", {"class": "sectionname"})
        raw = heading.text.strip()
        limit = min(len(raw), 50)
        return unquote(raw)[:limit]
    def _get_activities(self) -> list[MoodleActivity]:
        """Wrap every ``li.activity`` element of the section in a MoodleActivity."""
        return [MoodleActivity(item) for item in self.soup.find_all("li", {"class": "activity"})]
    def set_parent_dir(self, parent_dir: os.path, index: int):
        """Point every activity at ``<parent_dir>/<index> - <title>``."""
        folder_name = f"{index} - {self.title}"
        for activity in self.activities:
            activity.set_parent_dir(os.path.join(parent_dir, folder_name))
 class MoodleCourse:
    """A Moodle course, built from its listing label and its course page URL.

    The page is not fetched on construction; call ``init()`` with an
    authenticated session to load it. Until then ``soup`` and ``sections``
    are ``None``.
    """
    def __init__(self, name_with_code: str, url: str):
        """Split *name_with_code* into code (first word) and name (the rest)."""
        self.url = url
        words = name_with_code.split()
        # An empty label yields empty code/name instead of an IndexError.
        self.code = unquote(words[0]) if words else ""
        self.name = unquote(" ".join(words[1:]))
        self.soup = None
        self.sections = None
    def __repr__(self):
        # ``sections`` is None before init(); report 0 rather than raising.
        return f"{self.code} {self.name}: {len(self.sections or [])} sections"
    def init(self, s: "requests.Session"):
        """Download the course page with session *s* and parse its sections."""
        self.soup = self._get_soup(s)
        self.sections = self._get_sections()
    def _get_soup(self, s: "requests.Session") -> "BeautifulSoup":
        """Fetch the course page and return it parsed."""
        r = s.get(self.url)
        return BeautifulSoup(r.text, 'html.parser')
    def _get_sections(self) -> "list[MoodleCourseSection]":
        """Wrap every ``li.course-section`` element in a MoodleCourseSection."""
        sections_raw = self.soup.find_all("li", {"class": "course-section"})
        return [MoodleCourseSection(raw) for raw in sections_raw]
    def set_parent_dir(self, parent_dir, year: "str | None" = None):
        """Assign ``<parent_dir>/<code> - <name>/<year>`` as base directory of all sections.

        When *year* is None it is taken from the fourth ``/``-separated piece
        of the course URL, i.e. the first path segment after the host.
        Does nothing when the sections have not been loaded yet.
        """
        if year is None:
            year = self.url.split("/")[3]
        course_dir = os.path.join(parent_dir, f"{self.code} - {self.name}", year)
        for i, sec in enumerate(self.sections or []):
            sec.set_parent_dir(course_dir, i)
--- a/README.md
+++ b/README.md
@@ -0,0 +1,42 @@
 # MoodleScraper
 A simple tool to download as many files from moodle as possible.
 This was designed for me to easily build an archive of my courses at EPFL. It is still in development and was not built for other moodles.
 ## Requirements
 This script was written for Python 3.10.4 and uses the following packages:
 | package        | version |
 |----------------|---------|
 | beautifulsoup4 | 4.12.2  |
 | requests       | 2.31.0  |
 ## Features
 This script should be able to download all files directly linked from Moodle resources and folders. It is not able to download files from secondary sources.
 ## Usage
 The script is pretty simple to use. Just provide a moodle link and authentication information like this:
 ```
 python main.py -u USER -p PASSWORD -d /moodle -m https://moodlearchive.epfl.ch/2019-2020/
 ```
 If you provide a URL that doesn't end with the year, you must specify the year like this:
 ```
 python main.py -u USER -p PASSWORD -d /moodle -m https://moodle.epfl.ch/ -y 2023-2024
 ```
 For speed or storage reasons you can tell the script to not download a specific kind of file:
 ```
 python main.py -u USER -p PASSWORD -d /moodle -m https://moodlearchive.epfl.ch/2019-2020/ -i mp4 -i mov
 ```
 | argument           | short | example                 |
 |--------------------|-------|-------------------------|
 | --username         | -u    | username                |
 | --password         | -p    | superstrongpassword     |
 | --moodle_url       | -m    | https://moodle.epfl.ch/ |
 | --academic_year    | -y    | 2023-2024               |
 | --storage_dir      | -d    | /moodle                 |
 | --ignore_extension | -i    | mp4                     |
 ## Planned
 - [ ] Download submission files
 - [ ] Create url files for secondary urls
 - 
--- a/main.py
+++ b/main.py
@@ -0,0 +1,118 @@
 import requests
 from bs4 import BeautifulSoup
 from MoodleCourse import MoodleCourse
 import os
 import pathvalidate
 from urllib.parse import unquote
 def get_moodle_auth_cookie(s: requests.Session, url: str) -> requests.Session:
    """Log session *s* in through the Tequila login endpoint and return it.

    Visits the Moodle login page under *url* first, then posts the
    ``TequilaPHP`` cookie value as request key together with the username
    and password from the module-level ``args``.
    """
    login_page = f"{url}login/index.php"
    s.get(login_page)
    credentials = {
        "requestkey": s.cookies["TequilaPHP"],
        "username": args.username,
        "password": args.password,
    }
    s.post("https://tequila.epfl.ch/cgi-bin/tequila/login", data=credentials)
    return s
 def get_courses(s: requests.Session, url: str) -> list[MoodleCourse]:
    """Return a MoodleCourse for every link in the profile page's course card.

    The course list is read from the second ``section`` element carrying
    the card classes; each link's course id is the value of its
    second-to-last query parameter.
    """
    response = s.get(f"{url}user/profile.php?showallcourses=1")
    page = BeautifulSoup(response.text, 'html.parser')
    cards = page.find_all("section", {"class": "node_category card d-inline-block w-100 mb-3"})
    anchors = cards[1].find_all("a")
    labels = [anchor.text.strip() for anchor in anchors]
    hrefs = [anchor["href"] for anchor in anchors]
    course_urls = []
    for href in hrefs:
        course_id = href.split('&')[-2].split('=')[1]
        course_urls.append(f"{url}course/view.php?id={course_id}")
    return [MoodleCourse(label, course_url) for label, course_url in zip(labels, course_urls)]
 def download_resource(s, activity):
    """Download the file behind *activity* into ``<parent_dir>/<name>/``.

    The file name is the last path segment of ``activity.url``. The data is
    streamed into a ``.part`` file that is renamed once complete, so an
    interrupted download never leaves a truncated file under the final name.

    Returns True when the file was downloaded, already existed, or has an
    extension listed in ``args.ignore_extension``; False when the download
    failed.
    """
    directory = pathvalidate.sanitize_filepath(os.path.join(activity.parent_dir, activity.name))
    target = pathvalidate.sanitize_filepath(os.path.join(directory, unquote(activity.url.split('/')[-1])))
    extension = os.path.splitext(target)[1].lstrip(".")
    if args.ignore_extension is not None and extension in args.ignore_extension:
        return True
    if os.path.exists(target):
        return True
    partial = target + ".part"
    try:
        os.makedirs(directory, exist_ok=True)
        with s.get(activity.url, stream=True) as r:
            # Do not store an HTTP error page as if it were the file.
            r.raise_for_status()
            with open(partial, 'wb') as f:
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)
        os.replace(partial, target)
    except (OSError, requests.RequestException):
        if os.path.exists(partial):
            os.remove(partial)
        return False
    # No ``finally: return`` here: it would override the ``return False``
    # above and hide every failure from the caller.
    return True
 def download_folder(s, activity):
    """Download the archive behind *activity* into ``<parent_dir>/<name>/``.

    The file name is whatever follows the last single quote of the
    response's ``Content-Disposition`` header (presumably the
    ``filename*=UTF-8''...`` form — confirm against the server). The data is
    streamed into a ``.part`` file that is renamed once complete.

    Returns True when the archive was downloaded or already existed, False
    when the request failed, the header was missing, or writing failed.
    """
    directory = pathvalidate.sanitize_filepath(os.path.join(activity.parent_dir, activity.name))
    # Stays None until the target name is known, so the error path never
    # touches an unbound variable.
    partial = None
    try:
        with s.get(activity.url, stream=True) as r:
            # Do not store an HTTP error page as if it were the archive.
            r.raise_for_status()
            disposition = r.headers['Content-Disposition']
            target = pathvalidate.sanitize_filepath(os.path.join(directory, unquote(disposition.split('\'')[-1])))
            if os.path.exists(target):
                return True
            os.makedirs(directory, exist_ok=True)
            partial = target + ".part"
            with open(partial, 'wb') as f:
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)
        os.replace(partial, target)
    except (KeyError, ValueError, OSError, requests.RequestException):
        if partial is not None and os.path.exists(partial):
            os.remove(partial)
        return False
    # No ``finally: return`` here: it would override the ``return False``
    # above and hide every failure from the caller.
    return True
 def download_year(target_url: str):
    """Download every resource and folder of every course found at *target_url*.

    Logs in, lists the courses, and walks each course's sections and
    activities. Activities whose type is neither ``resource`` nor ``folder``
    are skipped. Failures are reported on stdout and do not stop the run.
    """
    session = get_moodle_auth_cookie(requests.Session(), target_url)
    for course in get_courses(session, target_url):
        course.init(session)
        print(target_url, course.name)
        course.set_parent_dir(args.storage_dir, args.academic_year)
        for section in course.sections:
            for activity in section.activities:
                kind = activity.type
                if kind == "resource":
                    located = activity.find_file(session)
                    fetch = download_resource
                elif kind == "folder":
                    located = activity.find_folder(session)
                    fetch = download_folder
                else:
                    continue
                if not located:
                    print("Could not find: ", activity.url)
                elif not fetch(session, activity):
                    print("Could not download: ", activity.url)
 if __name__ == "__main__":
    # Command-line entry point: parse the options and download the given Moodle.
    import argparse
    parser = argparse.ArgumentParser(description='Moodle to download')
    parser.add_argument('--username', '-u')
    parser.add_argument('--password', '-p')
    parser.add_argument('--moodle_url', '--url', '-m', default='https://moodle.epfl.ch/')
    parser.add_argument('--academic_year', '--year', '-y', default=None)
    parser.add_argument('--storage_dir', '--dir', '-d', default='.')
    # 'extend' accumulates values over repeated flags, so both
    # "-i mp4 -i mov" and "-i mp4 mov" ignore both extensions. With the
    # default store action only the last -i occurrence was kept.
    parser.add_argument('--ignore_extension', '-i', nargs='*', action='extend')
    # Assigned at module level, so the functions above can read ``args``
    # as a global; no ``global`` statement is needed here.
    args = parser.parse_args()
    # Request URLs are built by appending paths directly to the base URL,
    # which therefore has to end with a slash.
    if not args.moodle_url.endswith("/"):
        args.moodle_url += "/"
    download_year(args.moodle_url)
--- a/requirement.txt
+++ b/requirement.txt