From 3dc76ea34c5d62bb6230d6fa35e6f827d6e3da23 Mon Sep 17 00:00:00 2001
From: edufour <edouard.dufour@micro-ondes.ch>
Date: Fri, 19 Jan 2024 19:04:20 +0000
Subject: [PATCH] First commit

Script is functional for resources and folders.
---
 MoodleCourse.py | 151 ++++++++++++++++++++++++++++++++++++++++++++++++
 README.md       |  42 ++++++++++++++
 main.py         | 118 +++++++++++++++++++++++++++++++++++++
 requirement.txt | Bin 0 -> 1592 bytes
 4 files changed, 311 insertions(+)
 create mode 100644 MoodleCourse.py
 create mode 100644 README.md
 create mode 100644 main.py
 create mode 100644 requirement.txt

diff --git a/MoodleCourse.py b/MoodleCourse.py
new file mode 100644
index 0000000..72fa082
--- /dev/null
+++ b/MoodleCourse.py
@@ -0,0 +1,151 @@
+import requests
+from bs4 import BeautifulSoup
+import bs4
+import os
+from urllib.parse import unquote
+
+
+class MoodleActivity:
+    def __init__(self, soup):
+        self.soup = soup
+        self.name = None
+        self.url = None
+        self.type = None
+        self.parent_dir = None
+        self.all_urls = self._get_all_urls()
+        if self.all_urls:
+            self.init()
+
+    def __repr__(self):
+        return f"{self.type}: {self.name}"
+
+    def init(self):
+        self.name = self._get_name()
+        self.type = self._get_type()
+        self.url = self._get_url()
+
+    def _get_all_urls(self):
+        return list(map(lambda x: x["href"], self.soup.find_all("a")))
+
+    def _get_name(self):
+        link = self.soup.find("a")
+        text = link.text
+        toremove = link.find("span", {"class": "accesshide"})
+        if toremove is None:
+            name = text.strip()
+        else:
+            name = text[:text.find(toremove.text)].strip()
+        return unquote(name)[:min(len(name), 50)]
+
+    def _get_url(self):
+        url = self.soup.find("a")["href"]
+        if url in self.all_urls:
+            self.all_urls.remove(url)
+        return url
+
+    def _get_type(self):
+        return self.soup["class"][2]
+
+    def set_parent_dir(self, parent_dir: os.path):
+        self.parent_dir = parent_dir
+
+    def find_file(self, s: requests.Session) -> bool:
+        r = s.get(self.url, allow_redirects=True)
+        if r.url.find("view.php?id=")>0:
+            try:
+                soup = BeautifulSoup(r.text, 'html.parser')
+                main_region = soup.find("section", {"id": "region-main"})
+                if main_region is not None:
+                    link = main_region.find("a")
+                    if link is not None:
+                        self.url = link["href"]
+                        return True
+                    else:
+                        img = main_region.find("img")
+                        if img is not None:
+                            self.url = img["src"]
+                            return True
+                        else:
+                            return False
+                else:
+                    frames = soup.find_all("frame")
+                    if len(frames) > 1:
+                        self.url = frames[1]["src"]
+                        return True
+                    else:
+                        return False
+            except Exception as e:
+                return False
+        else:
+            self.url = r.url.split("?")[0]
+            return True
+
+    def find_folder(self, s: requests.Session):
+        try:
+            form = self.soup.find("form")
+            url = form["action"] + "?id=" + form.find("input")["value"]
+            if url.find("download_folder.php") > 0:
+                self.url = url
+                return True
+            else:
+                return False
+        except:
+            r = s.get(self.url, allow_redirects=True)
+            if r.url.find("view.php?id=") > 0:
+                url = r.url.replace("view.php", "download_folder.php")
+                self.url = url
+                return True
+            else:
+                return False
+
+
+class MoodleCourseSection:
+    def __init__(self, soup: bs4.element.Tag):
+        self.soup = soup
+        self.title = self._get_title()
+        self.activities = self._get_activities()
+
+    def __repr__(self):
+        return f"{self.title}: {len(self.activities)} activities"
+
+    def _get_title(self) -> str:
+        title = self.soup.find("h3", {"class": "sectionname"}).text.strip()
+        return unquote(title)[:min(len(title), 50)]
+
+    def _get_activities(self) -> list[MoodleActivity]:
+        activities_raw = self.soup.find_all("li", {"class": "activity"})
+        return list(map(lambda x: MoodleActivity(x), activities_raw))
+
+    def set_parent_dir(self, parent_dir: os.path, index: int):
+        for act in self.activities:
+            act.set_parent_dir(os.path.join(parent_dir, f"{index} - {self.title}"))
+
+
+class MoodleCourse:
+    def __init__(self, name_with_code: str, url: str):
+        self.url = url
+        self.code = unquote(name_with_code.split()[0])
+        self.name = unquote(" ".join(name_with_code.split()[1:]).strip())
+        self.soup = None
+        self.sections = None
+
+    def __repr__(self):
+        return f"{self.code} {self.name}: {len(self.sections)} sections"
+
+    def init(self, s: requests.Session):
+        self.soup = self._get_soup(s)
+        self.sections = self._get_sections()
+
+    def _get_soup(self, s: requests.Session) -> BeautifulSoup:
+        r = s.get(self.url)
+        return BeautifulSoup(r.text, 'html.parser')
+
+    def _get_sections(self) -> list[MoodleCourseSection]:
+        sections_raw = self.soup.find_all("li", {"class": "course-section"})
+        return list(map(lambda x: MoodleCourseSection(x), sections_raw))
+
+    def set_parent_dir(self, parent_dir, year: str = None):
+        if year is None:
+            year = self.url.split("/")[3]
+        for i, sec in enumerate(self.sections):
+            sec.set_parent_dir(os.path.join(parent_dir, f"{self.code} - {self.name}", year), i)
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..59addba
--- /dev/null
+++ b/README.md
@@ -0,0 +1,42 @@
+# MoodleScraper
+A simple tool to download as many files from moodle as possible.
+This was designed for me to easily build an archive of my courses at EPFL. It is still in development and was not built for other moodles.
+
+## Requirements
+This script was written for Python 3.10.4 and requires the following packages:
+
+| package        | version |
+|----------------|---------|
+| beautifulsoup4 | 4.12.2  |
+| requests       | 2.31.0  |
+
+## Features
+This script should be able to download all files directly linked from Moodle resources and folders. It is not able to download files from secondary sources.
+
+## Usage
+The script is pretty simple to use. Just provide a moodle link and authentication information like this:
+```
+python main.py -u USER -p PASSWORD -d /moodle -m https://moodlearchive.epfl.ch/2019-2020/
+```
+If you provide a URL that doesn't end with the year, you must specify the year like this:
+```
+python main.py -u USER -p PASSWORD -d /moodle -m https://moodle.epfl.ch/ -y 2023-2024
+```
+For speed or storage reasons you can tell the script to not download a specific kind of file:
+```
+python main.py -u USER -p PASSWORD -d /moodle -m https://moodlearchive.epfl.ch/2019-2020/ -i mp4 -i mov
+```
+
+| argument           | short | example                 |
+|--------------------|-------|-------------------------|
+| --username         | -u    | username                |
+| --password         | -p    | superstrongpassword     |
+| --moodle_url       | -m    | https://moodle.epfl.ch/ |
+| --academic_year    | -y    | 2023-2024               |
+| --storage_dir      | -d    | /moodle                 |
+| --ignore_extension | -i    | mp4                     |
+
+## Planned
+- [ ] Download submission files
+- [ ] Create url files for secondary urls
+- 
diff --git a/main.py b/main.py
new file mode 100644
index 0000000..fb48efa
--- /dev/null
+++ b/main.py
@@ -0,0 +1,118 @@
+import requests
+from bs4 import BeautifulSoup
+from MoodleCourse import MoodleCourse
+import os
+import pathvalidate
+from urllib.parse import unquote
+
+
+def get_moodle_auth_cookie(s: requests.Session, url: str) -> requests.Session:
+    s.get(f"{url}login/index.php")
+    s.post(f"https://tequila.epfl.ch/cgi-bin/tequila/login", data={
+        "requestkey": s.cookies["TequilaPHP"],
+        "username": args.username,
+        "password": args.password
+    })
+    return s
+
+
+def get_courses(s: requests.Session, url: str) -> list[MoodleCourse]:
+    r = s.get(f"{url}user/profile.php?showallcourses=1")
+    soup = BeautifulSoup(r.text, 'html.parser')
+    course_node = soup.find_all("section", {"class": "node_category card d-inline-block w-100 mb-3"})[1]
+    course_list = course_node.find_all("a")
+    course_names = list(map(lambda x: x.text.strip(), course_list))
+    course_list = list(map(lambda x: x["href"], course_list))
+    course_urls = list(map(lambda x: f"{url}course/view.php?id={x.split('&')[-2].split('=')[1]}", course_list))
+    return list(map(lambda x: MoodleCourse(x[0], x[1]), zip(course_names, course_urls)))
+
+
+def download_resource(s, activity):
+    directory = os.path.join(activity.parent_dir, activity.name)
+    directory = pathvalidate.sanitize_filepath(directory)
+    file = os.path.join(directory, unquote(activity.url.split('/')[-1]))
+    file = pathvalidate.sanitize_filepath(file)
+    if args.ignore_extension is not None and str(file).split(".")[-1] in args.ignore_extension:
+        return True
+    if os.path.exists(os.path.join(file)):
+        return True
+    if not os.path.exists(directory):
+        os.makedirs(directory)
+    try:
+        with s.get(activity.url, stream=True) as r:
+            with open(file+".part", 'wb') as f:
+                for chunk in r.iter_content(chunk_size=8192):
+                    f.write(chunk)
+        os.rename(file+".part", file)
+        if os.path.exists(file + ".part"):
+            os.remove(file + ".part")
+    except:
+        if os.path.exists(file+".part"):
+            os.remove(file+".part")
+        return False
+    finally:
+        return True
+
+
+def download_folder(s, activity):
+    directory = os.path.join(activity.parent_dir, activity.name)
+    directory = pathvalidate.sanitize_filepath(directory)
+    try:
+        with s.get(activity.url, stream=True) as r:
+            file = os.path.join(directory, unquote(r.headers['Content-Disposition'].split('\'')[-1]))
+            file = pathvalidate.sanitize_filepath(file)
+            if os.path.exists(os.path.join(file)):
+                return True
+            if not os.path.exists(directory):
+                os.makedirs(directory)
+            with open(file + ".part", 'wb') as f:
+                for chunk in r.iter_content(chunk_size=8192):
+                    f.write(chunk)
+            os.rename(file + ".part", file)
+            if os.path.exists(file + ".part"):
+                os.remove(file + ".part")
+    except:
+        if os.path.exists(file+".part"):
+            os.remove(file+".part")
+        return False
+    finally:
+        return True
+
+
+def download_year(target_url: str):
+    s = requests.Session()
+    s = get_moodle_auth_cookie(s, target_url)
+    courses = get_courses(s, target_url)
+    for course in courses:
+        course.init(s)
+        print(target_url, course.name)
+        course.set_parent_dir(args.storage_dir, args.academic_year)
+        for section in course.sections:
+            for activity in section.activities:
+                if activity.type == "resource":
+                    if activity.find_file(s):
+                        if not download_resource(s, activity):
+                            print("Could not download: ", activity.url)
+                    else:
+                        print("Could not find: ", activity.url)
+                elif activity.type == "folder":
+                    if activity.find_folder(s):
+                        if not download_folder(s, activity):
+                            print("Could not download: ", activity.url)
+                    else:
+                        print("Could not find: ", activity.url)
+
+
+if __name__ == "__main__":
+    import argparse
+
+    parser = argparse.ArgumentParser(description='Moodle to download')
+    parser.add_argument('--username', '-u')
+    parser.add_argument('--password', '-p')
+    parser.add_argument('--moodle_url', '--url', '-m', default='https://moodle.epfl.ch/')
+    parser.add_argument('--academic_year', '--year', '-y', default=None)
+    parser.add_argument('--storage_dir', '--dir', '-d', default='.')
+    parser.add_argument('--ignore_extension', '-i', nargs='*')
+    global args
+    args = parser.parse_args()
+    download_year(args.moodle_url)
diff --git a/requirement.txt b/requirement.txt
new file mode 100644
index 0000000000000000000000000000000000000000..c92b709b531e1795cc28fc2b3444b67cf9194a79
GIT binary patch
literal 1592
zcmZ{kL2uJQ5QXQA#7{{ThqS<f0}@9hD#0B&#z_oLZ0xiR{qey2W_InA3R#XF@7p(T
z-puTu-=&>&4K~_4T`OHzelP8%y|DLor`66525v?awiGTf8>_VnuCcA2t++Zpdt3XC
zwU9;Pg|l+5|HrJvHzc+u#awFtE9WWv_qD656mP4j=!)+*ld8)0ibGztxE6Mlror=Q
zwd2uaun$)5RbuRf9JQm=^Q2sUB?S?ktk(%SXss8fvz?erz3&rx?P|bgoufEW((10X
zCo9J7aIv<tR%&N##Cdeq@bNjCtl?^ftn7#Lg~Is7XVmn<3_>;H3)E`T7I7Q<rnPcn
zxbf`3SK3+2v0Cr=(IzL*d|CfStKdp8;R62#$Kt@7DZ`9IGgwg3by2(Q#CMN5iiw`v
zI}-@l3Rl<4t3%Rr?O*fWgSb0!<4vEc!M>HMmND?+St*CfbC>d(y8uLVPUK8<@`&hJ
z*S%0UOV;F^yJ@cJtynr=+v&+VT8I<gqp*|*M=|Hs$cOq}t}c`>c`;pea_u=+<(`G|
zs6b3}=)P6757L{RWkPbLLUZm{B7YJWmH({kqpOPvoYnu7n%Zw%*f<8Kn8BDP`X^9K
zR6b#FAnHk$|6&$C(PWvses+Wj8th)tn3RQkAKgiBJ)P9-@hH@J;woHSxSvlc5Vcq%
zB2HX;mofcx=~>*&cb?naapCG0d(eLJtA$2$oS~N!a(aS6H&fed7pG*ef1GzYU+NlX
z3_0~e)1zOL67ofHoKx(fv)}5DtnS3sdS7njbLgRS^ZJ9n&h;W&)}mgIVx{U=9y8{6
M>zRi)R5r5y3pz6DumAu6

literal 0
HcmV?d00001