Changing moodle activity handling

This commit is contained in:
2024-01-27 01:02:41 +01:00
parent a738c6db39
commit 8e10323ac3
9 changed files with 320 additions and 247 deletions

View File

@@ -0,0 +1,32 @@
from urllib.parse import unquote
import bs4
import os
class MoodleActivity:
    """Base class for a single activity element scraped from a course page.

    Attributes:
        soup: The parsed element this activity was built from.
        all_urls: ``href`` values of the links found inside ``soup``.
        name: Display name taken from the first link, or from the element
            text when the element contains no link.
        parent_dir: Directory assigned through :meth:`set_parent_dir`;
            ``None`` until then.
    """

    # Names taken from a link are cut to this many characters.
    _MAX_NAME_LENGTH = 50

    def __init__(self, soup: bs4.element.Tag):
        self.soup = soup
        self.all_urls = self._get_all_urls()
        self.name = self._get_name()
        self.parent_dir = None

    def __repr__(self):
        return f"{self.parent_dir}/{self.name}"

    def _get_all_urls(self) -> list[str]:
        """Return the ``href`` of every link in the element.

        Anchors without an ``href`` attribute are skipped instead of raising
        ``KeyError``.
        """
        return [link["href"] for link in self.soup.find_all("a", href=True)]

    def _get_name(self) -> str:
        """Derive the activity name.

        Without a link the first line of the element text is returned as is.
        With a link, the link text is used, cut before the text of a nested
        ``span.accesshide`` (if any), URL-unquoted and truncated.
        """
        link = self.soup.find("a")
        if link is None:
            return self.soup.text.strip().split("\n")[0].strip()

        text = link.text
        hidden = link.find("span", {"class": "accesshide"})
        # An empty hidden span would match at index 0 and erase the whole
        # name, so only cut when there is real text to cut at.
        if hidden is not None and hidden.text:
            cut = text.find(hidden.text)
            if cut >= 0:
                text = text[:cut]
        return unquote(text.strip())[:self._MAX_NAME_LENGTH]

    def set_parent_dir(self, parent_dir: str) -> None:
        """Store the directory this activity belongs to."""
        self.parent_dir = parent_dir

View File

@@ -0,0 +1,36 @@
from MoodleClasses.MoodleActivity import MoodleActivity
from MoodleClasses.MoodleFile import MoodleFile
from bs4 import BeautifulSoup
from urllib.parse import unquote
import requests
class MoodleAssign(MoodleActivity):
    """Assignment activity whose uploaded submission files can be downloaded.

    Attributes:
        url: ``href`` of the first link in the activity element, or ``None``
            when the element has no usable link.
        files: ``MoodleFile`` objects collected by :meth:`find`.
    """

    def __init__(self, soup: BeautifulSoup):
        super().__init__(soup)
        self.url = self._get_url()
        self.parent_dir = None
        self.files = []

    def _get_url(self) -> str:
        """Return the first link's ``href`` and drop it from ``all_urls``.

        Returns ``None`` when the element has no link or the link has no
        ``href``, instead of raising.
        """
        link = self.soup.find("a")
        if link is None:
            return None
        url = link.get("href")
        if url in self.all_urls:
            self.all_urls.remove(url)
        return url

    def find(self, s: requests.Session) -> bool:
        """Fetch the assignment page and collect its submission files.

        Every ``div.fileuploadsubmission`` that contains a link contributes
        one file; the link text (URL-unquoted) is used as the file name.

        Returns:
            ``False`` when the activity has no URL, ``True`` otherwise.
        """
        if self.url is None:
            return False

        r = s.get(self.url)
        soup = BeautifulSoup(r.text, 'html.parser')

        # Start from scratch so a repeated call does not duplicate entries.
        self.files = []
        for candidate in soup.find_all("div", {"class": "fileuploadsubmission"}):
            link = candidate.find("a")
            if link is None or not link.get("href"):
                continue
            url = link["href"].split('?')[0]
            file = MoodleFile(self, url)
            # _get_filepath only returns the path; it has to be stored for
            # the link text to become the file name.
            file.filepath = file._get_filepath(unquote(link.text))
            file.extension = str(file.filepath).split(".")[-1]
            self.files.append(file)
        return True

    def download(self, s: requests.Session, ignore_extension: list[str]) -> bool:
        """Download every collected file.

        All files are attempted even after a failure.

        Returns:
            ``True`` only if every file reported success.
        """
        result = True
        for file in self.files:
            if not file.download(s, ignore_extension):
                result = False
        return result

View File

@@ -0,0 +1,35 @@
from MoodleClasses.MoodleCourseSection import MoodleCourseSection
from bs4 import BeautifulSoup
from urllib.parse import unquote
import requests
import os
class MoodleCourse:
    """A course identified by a "<code> <name>" label and its page URL.

    Attributes:
        url: URL of the course page.
        code: First whitespace-separated token of the label (URL-unquoted).
        name: Remaining tokens of the label joined by single spaces.
        soup: Parsed course page; ``None`` until :meth:`init` is called.
        sections: List of ``MoodleCourseSection``; ``None`` until
            :meth:`init` is called.
    """

    def __init__(self, name_with_code: str, url: str):
        self.url = url
        parts = name_with_code.split()
        # An empty label yields an empty code instead of an IndexError.
        self.code = unquote(parts[0]) if parts else ""
        self.name = unquote(" ".join(parts[1:]))
        self.soup = None
        self.sections = None

    def __repr__(self):
        # sections is None before init(); report 0 rather than raising.
        count = 0 if self.sections is None else len(self.sections)
        return f"{self.code} {self.name}: {count} sections"

    def init(self, s: requests.Session) -> None:
        """Download the course page and parse its sections."""
        self.soup = self._get_soup(s)
        self.sections = self._get_sections()

    def _get_soup(self, s: requests.Session) -> BeautifulSoup:
        """Fetch ``self.url`` with the session and parse the HTML."""
        r = s.get(self.url)
        return BeautifulSoup(r.text, 'html.parser')

    def _get_sections(self) -> list[MoodleCourseSection]:
        """Wrap every ``li.course-section`` of the page in a section object."""
        sections_raw = self.soup.find_all("li", {"class": "course-section"})
        return [MoodleCourseSection(raw) for raw in sections_raw]

    def set_parent_dir(self, parent_dir: str, year: str = None) -> None:
        """Assign "<parent_dir>/<code> - <name>/<year>" to every section.

        Must be called after :meth:`init`, since it iterates ``sections``.

        Args:
            parent_dir: Root directory for the course.
            year: Sub-directory name; defaults to the fourth "/"-separated
                piece of ``self.url``.
        """
        if year is None:
            year = self.url.split("/")[3]
        course_dir = os.path.join(parent_dir, f"{self.code} - {self.name}", year)
        for i, sec in enumerate(self.sections):
            sec.set_parent_dir(course_dir, i)

View File

@@ -0,0 +1,46 @@
from urllib.parse import unquote
from MoodleClasses.MoodleActivity import MoodleActivity
from MoodleClasses.MoodleResource import MoodleResource
from MoodleClasses.MoodleFolder import MoodleFolder
from MoodleClasses.MoodleAssign import MoodleAssign
from bs4 import BeautifulSoup
from typing import Type
import bs4
import os
class MoodleCourseSection:
    """One section of a course page together with its supported activities.

    Attributes:
        soup: The parsed section element.
        title: Section title (URL-unquoted, at most 50 characters).
        activities: Activity objects built from the ``li.activity`` elements
            whose type is recognised by :meth:`_classify_activity`.
    """

    # Section titles are cut to this many characters.
    _MAX_TITLE_LENGTH = 50

    def __init__(self, soup: bs4.element.Tag):
        self.soup = soup
        self.title = self._get_title()
        self.activities = self._get_activities()

    def __repr__(self):
        return f"{self.title}: {len(self.activities)} activities"

    def _get_title(self) -> str:
        """Return the text of ``h3.sectionname``, or "" when it is missing."""
        heading = self.soup.find("h3", {"class": "sectionname"})
        if heading is None:
            return ""
        return unquote(heading.text.strip())[:self._MAX_TITLE_LENGTH]

    def _get_activities(self) -> list[MoodleActivity]:
        """Instantiate the matching class for every recognised activity."""
        activities = []
        for raw in self.soup.find_all("li", {"class": "activity"}):
            activity_class = self._classify_activity(raw)
            if activity_class is not None:
                activities.append(activity_class(raw))
        return activities

    @staticmethod
    def _classify_activity(soup: BeautifulSoup) -> "Type[MoodleActivity] | None":
        """Map an activity element to the class that handles it.

        The type is read from the third entry of the element's ``class``
        list. Elements with fewer than three classes, or with an unsupported
        type, yield ``None``.
        """
        classes = soup.get("class") or []
        if len(classes) < 3:
            return None
        activitytype = classes[2]
        if activitytype == "resource":
            return MoodleResource
        if activitytype == "folder":
            return MoodleFolder
        if activitytype == "assign":
            return MoodleAssign
        return None

    def set_parent_dir(self, parent_dir: str, index: int) -> None:
        """Assign "<parent_dir>/<index> - <title>" to every activity."""
        section_dir = os.path.join(parent_dir, f"{index} - {self.title}")
        for act in self.activities:
            act.set_parent_dir(section_dir)

View File

@@ -0,0 +1,56 @@
from MoodleClasses.MoodleActivity import MoodleActivity
from urllib.parse import unquote
import pathvalidate
import requests
import os
class MoodleFile:
    """A downloadable file that belongs to an activity.

    Attributes:
        parent_activity: Activity that owns the file; its ``parent_dir`` and
            ``name`` determine the target directory.
        url: Download URL.
        directory: Sanitized target directory.
        filepath: Sanitized target path. Other code may reassign it after
            construction.
        extension: Text after the last "." of ``filepath``.
    """

    def __init__(self, parent_activity: MoodleActivity, url: str):
        self.parent_activity = parent_activity
        self.url = url
        self.directory = self._get_directory()
        self.filepath = self._get_filepath()
        self.extension = self._get_extension()

    def _get_directory(self) -> str:
        """Return "<parent_dir>/<activity name>", sanitized."""
        directory = os.path.join(self.parent_activity.parent_dir, self.parent_activity.name)
        return pathvalidate.sanitize_filepath(directory)

    def _get_filepath(self, filename: str = None) -> str:
        """Return the sanitized target path inside ``self.directory``.

        Args:
            filename: File name to use; defaults to the URL-unquoted last
                "/"-separated piece of ``self.url``.
        """
        if filename is None:
            filename = unquote(self.url.split('/')[-1])
        return pathvalidate.sanitize_filepath(os.path.join(self.directory, filename))

    def _get_extension(self) -> str:
        """Return the text after the last "." of the current ``filepath``."""
        return str(self.filepath).split(".")[-1]

    def _decide_download(self, ignore_extension: list[str]) -> bool:
        """Tell whether the file still has to be downloaded.

        Returns ``False`` when the extension is in ``ignore_extension`` or
        the target file already exists.
        """
        # filepath may have been replaced since __init__, so refresh the
        # extension before comparing it.
        self.extension = self._get_extension()
        if ignore_extension is not None and self.extension in ignore_extension:
            return False
        return not os.path.exists(self.filepath)

    def _download(self, s: requests.Session) -> bool:
        """Stream the file to "<filepath>.part" and move it into place.

        Returns:
            ``True`` on success. ``False`` on any failure, in which case the
            partial file is removed.
        """
        part_path = f"{self.filepath}.part"
        os.makedirs(self.directory, exist_ok=True)
        try:
            with s.get(self.url, stream=True) as r:
                # Do not store an HTTP error page as if it were the file.
                r.raise_for_status()
                with open(part_path, 'wb') as f:
                    for chunk in r.iter_content(chunk_size=8192):
                        f.write(chunk)
            os.replace(part_path, self.filepath)
        except Exception:
            # Best-effort boundary: failures are reported through the return
            # value so the caller can carry on with the next file.
            if os.path.exists(part_path):
                os.remove(part_path)
            return False
        return True

    def download(self, s: requests.Session, ignore_extension: list[str]) -> bool:
        """Download the file unless it is ignored or already present.

        Returns:
            ``True`` when the file was skipped or downloaded, ``False`` when
            the download failed.
        """
        if not self._decide_download(ignore_extension):
            return True
        return self._download(s)

View File

@@ -0,0 +1,47 @@
from MoodleClasses.MoodleActivity import MoodleActivity
from MoodleClasses.MoodleFile import MoodleFile
from bs4 import BeautifulSoup
from urllib.parse import unquote
import requests
class MoodleFolder(MoodleActivity):
    """Folder activity that is downloaded through ``download_folder.php``.

    Attributes:
        file: ``MoodleFile`` for the folder download; ``None`` until
            :meth:`find` has located it.
    """

    def __init__(self, soup: BeautifulSoup):
        super().__init__(soup)
        self.parent_dir = None
        self.file = None

    def _get_url(self) -> str:
        """Return the first link's ``href`` and drop it from ``all_urls``.

        Returns ``None`` when the element has no link or no ``href``.
        """
        link = self.soup.find("a")
        if link is None:
            return None
        url = link.get("href")
        if url in self.all_urls:
            self.all_urls.remove(url)
        return url

    def _get_form_url(self) -> str:
        """Build "<action>?id=<value>" from the element's form, if present.

        Returns ``None`` when there is no form, no input, or one of the two
        attributes is missing.
        """
        form = self.soup.find("form")
        if form is None:
            return None
        field = form.find("input")
        if field is None:
            return None
        action = form.get("action")
        value = field.get("value")
        if action is None or value is None:
            return None
        return action + "?id=" + value

    def find(self, s: requests.Session) -> bool:
        """Locate the folder download URL and its file name.

        The URL comes from the form embedded in the activity element when
        there is one; otherwise the activity link is followed and
        "view.php" in the final URL is replaced by "download_folder.php".
        The file name is taken from the ``Content-Disposition`` header of
        the download response.

        Returns:
            ``True`` when both URL and file name were determined.
        """
        url = self._get_form_url()
        if url is not None:
            if url.find("download_folder.php") <= 0:
                return False
        else:
            page_url = self._get_url()
            if page_url is None:
                return False
            r = s.get(page_url, allow_redirects=True)
            if r.url.find("view.php?id=") <= 0:
                return False
            url = r.url.replace("view.php", "download_folder.php")

        self.file = MoodleFile(self, url)
        with s.get(url, allow_redirects=True, stream=True) as rh:
            disposition = rh.headers.get('Content-Disposition')
            if disposition is None:
                return False
            # NOTE(review): only the text after the last single quote is
            # kept, which presumably targets the filename*=<charset>''<name>
            # header form - confirm against real responses.
            filename = unquote(disposition.split('\'')[-1])
            self.file.filepath = self.file._get_filepath(filename=filename)
            # Keep the extension in step with the new path so extension
            # filtering sees the real file type.
            self.file.extension = str(self.file.filepath).split(".")[-1]
        return True

    def download(self, s: requests.Session, ignore_extension: list[str]) -> bool:
        """Download the folder file; ``False`` when none has been found."""
        if self.file is None:
            return False
        return self.file.download(s, ignore_extension)

View File

@@ -0,0 +1,61 @@
from MoodleClasses.MoodleActivity import MoodleActivity
from MoodleClasses.MoodleFile import MoodleFile
from bs4 import BeautifulSoup
import requests
class MoodleResource(MoodleActivity):
    """Resource activity that resolves to a single downloadable file.

    Attributes:
        url: ``href`` of the first link in the activity element, or ``None``.
        file: ``MoodleFile`` located by :meth:`find`; ``None`` until then.
    """

    def __init__(self, soup: BeautifulSoup):
        super().__init__(soup)
        self.url = self._get_url()
        self.parent_dir = None
        self.file = None

    def _get_url(self) -> str:
        """Return the first link's ``href`` and drop it from ``all_urls``.

        Returns ``None`` when the element has no link or no ``href``.
        """
        link = self.soup.find("a")
        if link is None:
            return None
        url = link.get("href")
        if url in self.all_urls:
            self.all_urls.remove(url)
        return url

    @staticmethod
    def _find_embedded_url(soup: BeautifulSoup) -> str:
        """Return the file URL embedded in a resource view page, if any.

        Inside ``section#region-main`` the first link with an ``href`` wins,
        then the first image. Without that section, the ``src`` of the
        second ``<frame>`` is used.
        """
        main_region = soup.find("section", {"id": "region-main"})
        if main_region is None:
            frames = soup.find_all("frame")
            return frames[1]["src"] if len(frames) > 1 else None

        # Require an href so a bare anchor does not hide an image.
        link = main_region.find("a", href=True)
        if link is not None:
            return link["href"]
        img = main_region.find("img")
        return img["src"] if img is not None else None

    def find(self, s: requests.Session) -> bool:
        """Resolve the activity link to the actual file URL.

        When the request does not end on a "view.php?id=" page, the final
        URL without its query string is taken as the file. Otherwise the
        page is searched for an embedded link, image or frame.

        Returns:
            ``True`` when a file URL was found and ``self.file`` is set.
        """
        if self.url is None:
            return False

        r = s.get(self.url, allow_redirects=True)
        if r.url.find("view.php?id=") <= 0:
            self.file = MoodleFile(self, r.url.split("?")[0])
            return True

        try:
            target = self._find_embedded_url(BeautifulSoup(r.text, 'html.parser'))
            if target is None:
                return False
            self.file = MoodleFile(self, target)
            return True
        except Exception:
            # Best effort: an unexpected page layout counts as "not found".
            return False

    def download(self, s: requests.Session, ignore_extension: list[str]) -> bool:
        """Download the located file; ``False`` when none has been found."""
        if self.file is None:
            return False
        return self.file.download(s, ignore_extension)

View File

@@ -1,151 +0,0 @@
import requests
from bs4 import BeautifulSoup
import bs4
import os
from urllib.parse import unquote
class MoodleActivity:
    """One activity element of a course page (resource, folder, assign, ...).

    Attributes:
        soup: The parsed activity element.
        name, url, type: Filled by :meth:`init`; they stay ``None`` when the
            element contains no link with an ``href``.
        parent_dir: Directory assigned through :meth:`set_parent_dir`.
        all_urls: ``href`` values of the links in the element, minus the one
            stored in ``url``.
    """

    def __init__(self, soup):
        self.soup = soup
        self.name = None
        self.url = None
        self.type = None
        self.parent_dir = None
        self.all_urls = self._get_all_urls()
        if self.all_urls:
            self.init()

    def __repr__(self):
        return f"{self.type}: {self.name}"

    def init(self):
        """Populate name, type and url from the element."""
        self.name = self._get_name()
        self.type = self._get_type()
        self.url = self._get_url()

    def _get_all_urls(self):
        """Return the ``href`` of every link; href-less anchors are skipped."""
        return [link["href"] for link in self.soup.find_all("a", href=True)]

    def _get_name(self):
        """Return the first link's text, cut before a nested
        ``span.accesshide``, URL-unquoted and limited to 50 characters."""
        link = self.soup.find("a")
        text = link.text
        hidden = link.find("span", {"class": "accesshide"})
        # An empty hidden span would match at index 0 and erase the name.
        if hidden is not None and hidden.text:
            cut = text.find(hidden.text)
            if cut >= 0:
                text = text[:cut]
        return unquote(text.strip())[:50]

    def _get_url(self):
        """Return the first link's ``href`` and drop it from ``all_urls``."""
        url = self.soup.find("a").get("href")
        if url in self.all_urls:
            self.all_urls.remove(url)
        return url

    def _get_type(self):
        """Return the third entry of the element's ``class`` list, or
        ``None`` when the list is shorter."""
        classes = self.soup.get("class") or []
        return classes[2] if len(classes) > 2 else None

    def set_parent_dir(self, parent_dir: str):
        """Store the directory this activity belongs to."""
        self.parent_dir = parent_dir

    def find_file(self, s: requests.Session) -> bool:
        """Replace ``self.url`` with the URL of the actual file.

        When the request does not end on a "view.php?id=" page, the final
        URL without its query string is used. Otherwise the page is searched:
        inside ``section#region-main`` the first link, then the first image;
        without that section, the second ``<frame>``.

        Returns:
            ``True`` when ``self.url`` was updated.
        """
        if self.url is None:
            return False
        r = s.get(self.url, allow_redirects=True)
        if r.url.find("view.php?id=") <= 0:
            self.url = r.url.split("?")[0]
            return True

        try:
            soup = BeautifulSoup(r.text, 'html.parser')
            main_region = soup.find("section", {"id": "region-main"})
            if main_region is None:
                frames = soup.find_all("frame")
                if len(frames) <= 1:
                    return False
                self.url = frames[1]["src"]
                return True

            link = main_region.find("a")
            if link is not None:
                self.url = link["href"]
                return True
            img = main_region.find("img")
            if img is None:
                return False
            self.url = img["src"]
            return True
        except Exception:
            # Best effort: an unexpected page layout counts as "not found".
            return False

    def find_folder(self, s: requests.Session):
        """Replace ``self.url`` with the folder's ``download_folder.php`` URL.

        The URL is built from the form embedded in the element when there is
        one; otherwise the activity link is followed and "view.php" in the
        final URL is replaced.

        Returns:
            ``True`` when ``self.url`` was updated.
        """
        form = self.soup.find("form")
        field = form.find("input") if form is not None else None
        action = form.get("action") if form is not None else None
        value = field.get("value") if field is not None else None
        if action is not None and value is not None:
            url = action + "?id=" + value
            if url.find("download_folder.php") <= 0:
                return False
            self.url = url
            return True

        # No usable form: fall back to following the activity link.
        if self.url is None:
            return False
        r = s.get(self.url, allow_redirects=True)
        if r.url.find("view.php?id=") <= 0:
            return False
        self.url = r.url.replace("view.php", "download_folder.php")
        return True
class MoodleCourseSection:
    """One section of a course page together with its activities.

    Attributes:
        soup: The parsed section element.
        title: Section title (URL-unquoted, at most 50 characters).
        activities: One ``MoodleActivity`` per ``li.activity`` element.
    """

    def __init__(self, soup: bs4.element.Tag):
        self.soup = soup
        self.title = self._get_title()
        self.activities = self._get_activities()

    def __repr__(self):
        return f"{self.title}: {len(self.activities)} activities"

    def _get_title(self) -> str:
        """Return the text of ``h3.sectionname``, or "" when it is missing."""
        heading = self.soup.find("h3", {"class": "sectionname"})
        if heading is None:
            return ""
        return unquote(heading.text.strip())[:50]

    def _get_activities(self) -> list[MoodleActivity]:
        """Wrap every ``li.activity`` element in a ``MoodleActivity``."""
        activities_raw = self.soup.find_all("li", {"class": "activity"})
        return [MoodleActivity(raw) for raw in activities_raw]

    def set_parent_dir(self, parent_dir: str, index: int):
        """Assign "<parent_dir>/<index> - <title>" to every activity."""
        section_dir = os.path.join(parent_dir, f"{index} - {self.title}")
        for act in self.activities:
            act.set_parent_dir(section_dir)
class MoodleCourse:
    """A course identified by a "<code> <name>" label and its page URL.

    Attributes:
        url: URL of the course page.
        code: First whitespace-separated token of the label (URL-unquoted).
        name: Remaining tokens of the label joined by single spaces.
        soup: Parsed course page; ``None`` until :meth:`init` is called.
        sections: List of ``MoodleCourseSection``; ``None`` until
            :meth:`init` is called.
    """

    def __init__(self, name_with_code: str, url: str):
        self.url = url
        parts = name_with_code.split()
        # An empty label yields an empty code instead of an IndexError.
        self.code = unquote(parts[0]) if parts else ""
        self.name = unquote(" ".join(parts[1:]))
        self.soup = None
        self.sections = None

    def __repr__(self):
        # sections is None before init(); report 0 rather than raising.
        count = 0 if self.sections is None else len(self.sections)
        return f"{self.code} {self.name}: {count} sections"

    def init(self, s: requests.Session):
        """Download the course page and parse its sections."""
        self.soup = self._get_soup(s)
        self.sections = self._get_sections()

    def _get_soup(self, s: requests.Session) -> BeautifulSoup:
        """Fetch ``self.url`` with the session and parse the HTML."""
        r = s.get(self.url)
        return BeautifulSoup(r.text, 'html.parser')

    def _get_sections(self) -> list[MoodleCourseSection]:
        """Wrap every ``li.course-section`` of the page in a section object."""
        sections_raw = self.soup.find_all("li", {"class": "course-section"})
        return [MoodleCourseSection(raw) for raw in sections_raw]

    def set_parent_dir(self, parent_dir, year: str = None):
        """Assign "<parent_dir>/<code> - <name>/<year>" to every section.

        Must be called after :meth:`init`, since it iterates ``sections``.

        Args:
            parent_dir: Root directory for the course.
            year: Sub-directory name; defaults to the fourth "/"-separated
                piece of ``self.url``.
        """
        if year is None:
            year = self.url.split("/")[3]
        course_dir = os.path.join(parent_dir, f"{self.code} - {self.name}", year)
        for i, sec in enumerate(self.sections):
            sec.set_parent_dir(course_dir, i)

103
main.py
View File

@@ -1,9 +1,6 @@
import requests
from bs4 import BeautifulSoup
from MoodleCourse import MoodleCourse
import os
import pathvalidate
from urllib.parse import unquote
from MoodleClasses.MoodleCourse import MoodleCourse
def get_moodle_auth_cookie(s: requests.Session, url: str) -> requests.Session:
@@ -27,84 +24,7 @@ def get_courses(s: requests.Session, url: str) -> list[MoodleCourse]:
return list(map(lambda x: MoodleCourse(x[0], x[1]), zip(course_names, course_urls)))
def download_resource(s, activity):
    """Download the file behind ``activity.url`` into the activity directory.

    The target is "<parent_dir>/<name>/<last URL piece>", sanitized. The
    download is skipped when the extension is listed in
    ``args.ignore_extension`` (``args`` is defined outside this function) or
    when the file already exists.

    Returns:
        ``True`` when the file was skipped or downloaded, ``False`` when the
        download failed (the partial file is removed).
    """
    directory = os.path.join(activity.parent_dir, activity.name)
    directory = pathvalidate.sanitize_filepath(directory)
    file = os.path.join(directory, unquote(activity.url.split('/')[-1]))
    file = pathvalidate.sanitize_filepath(file)

    if args.ignore_extension is not None and str(file).split(".")[-1] in args.ignore_extension:
        return True
    if os.path.exists(file):
        return True

    os.makedirs(directory, exist_ok=True)
    part = file + ".part"
    try:
        with s.get(activity.url, stream=True) as r:
            with open(part, 'wb') as f:
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)
        os.rename(part, file)
    except Exception:
        # Report the failure to the caller; a `finally: return True` here
        # would override this return value.
        if os.path.exists(part):
            os.remove(part)
        return False
    return True
def download_folder(s, activity):
    """Download the folder behind ``activity.url``.

    The file name is the URL-unquoted text after the last single quote of
    the response's ``Content-Disposition`` header. An existing file is not
    downloaded again.

    Returns:
        ``True`` when the file already existed or was downloaded, ``False``
        on any failure (the partial file is removed).
    """
    directory = os.path.join(activity.parent_dir, activity.name)
    directory = pathvalidate.sanitize_filepath(directory)

    # Known only once the response headers have arrived; kept outside the
    # try block so the error handler never touches an unbound name.
    part = None
    try:
        with s.get(activity.url, stream=True) as r:
            file = os.path.join(directory, unquote(r.headers['Content-Disposition'].split('\'')[-1]))
            file = pathvalidate.sanitize_filepath(file)
            if os.path.exists(file):
                return True
            os.makedirs(directory, exist_ok=True)
            part = file + ".part"
            with open(part, 'wb') as f:
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)
        os.rename(part, file)
    except Exception:
        if part is not None and os.path.exists(part):
            os.remove(part)
        return False
    return True
def download_assign(s, activity):
    """Download every submission file linked from an assignment page.

    Each ``div.fileuploadsubmission`` containing a link contributes one
    file, stored as "<parent_dir>/<name>/<link text>" (sanitized). Files
    that already exist are skipped without being requested; the remaining
    ones are still processed.

    Returns:
        ``True`` once all candidates were handled. Errors while downloading
        propagate to the caller after the partial file has been removed.
    """
    r = s.get(activity.url)
    soup = BeautifulSoup(r.text, 'html.parser')
    for candidate in soup.find_all("div", {"class": "fileuploadsubmission"}):
        link = candidate.find("a")
        if link is None or not link.get("href"):
            continue
        url = link["href"].split('?')[0]

        directory = os.path.join(activity.parent_dir, activity.name)
        directory = pathvalidate.sanitize_filepath(directory)
        file = os.path.join(directory, unquote(link.text))
        file = pathvalidate.sanitize_filepath(file)
        os.makedirs(directory, exist_ok=True)
        # Skip only this file; returning here would drop the rest.
        if os.path.exists(file):
            continue

        part = file + ".part"
        try:
            with s.get(url, stream=True) as response:
                with open(part, 'wb') as f:
                    for chunk in response.iter_content(chunk_size=8192):
                        f.write(chunk)
            os.rename(part, file)
        finally:
            # Only still present when the download did not complete.
            if os.path.exists(part):
                os.remove(part)
    return True
def download_year(target_url: str):
def download_year(target_url: str) -> None:
s = requests.Session()
s = get_moodle_auth_cookie(s, target_url)
courses = get_courses(s, target_url)
@@ -114,20 +34,11 @@ def download_year(target_url: str):
course.set_parent_dir(args.storage_dir, args.academic_year)
for section in course.sections:
for activity in section.activities:
if activity.type == "resource":
if activity.find_file(s):
if not download_resource(s, activity):
print("Could not download: ", activity.url)
else:
print("Could not find: ", activity.url)
elif activity.type == "folder":
if activity.find_folder(s):
if not download_folder(s, activity):
print("Could not download: ", activity.url)
else:
print("Could not find: ", activity.url)
elif activity.type == "assign":
download_assign(s, activity)
if activity.find(s):
if not activity.download(s, args.ignore_extension):
print(f"Could not download: {activity}")
else:
print(f"Could not find: {activity}")
if __name__ == "__main__":