Files
MoodleScraper/MoodleCourse.py
edufour 3dc76ea34c First commit
Script is functional for resources and folders.
2024-01-19 19:04:20 +00:00

152 lines
5.1 KiB
Python

import requests
from bs4 import BeautifulSoup
import bs4
import os
from urllib.parse import unquote
class MoodleActivity:
    """A single activity entry parsed from a Moodle course page.

    The constructor receives the parsed element of the activity. When that
    element contains at least one link carrying an ``href``, the activity
    name, type and main URL are extracted from it; otherwise ``name``,
    ``type`` and ``url`` stay ``None``.
    """

    # Names longer than this many characters are truncated.
    _MAX_NAME_LENGTH = 50

    def __init__(self, soup):
        """Store the parsed element and extract the activity details.

        Args:
            soup: Parsed element of the activity (must offer ``find``,
                ``find_all`` and attribute access like a BeautifulSoup tag).
        """
        self.soup = soup
        self.name = None        # display name, set by init()
        self.url = None         # main link, later rewritten by find_file/find_folder
        self.type = None        # third CSS class of the element, set by init()
        self.parent_dir = None  # set from outside through set_parent_dir()
        self.all_urls = self._get_all_urls()
        # Without any link there is nothing to extract.
        if self.all_urls:
            self.init()

    def __repr__(self):
        return f"{self.type}: {self.name}"

    def init(self):
        """Fill ``name``, ``type`` and ``url`` from the parsed element."""
        self.name = self._get_name()
        self.type = self._get_type()
        self.url = self._get_url()

    def _get_all_urls(self):
        """Return the ``href`` of every link of the element, in document order.

        Links without an ``href`` attribute are skipped instead of raising
        ``KeyError``.
        """
        hrefs = (link.get("href") for link in self.soup.find_all("a"))
        return [href for href in hrefs if href is not None]

    def _get_name(self):
        """Return the visible text of the main link, at most 50 characters.

        The text of a nested ``<span class="accesshide">`` and everything
        after it is cut off. Returns ``None`` when no link with an ``href``
        exists.
        """
        link = self.soup.find("a", href=True)
        if link is None:
            return None
        text = link.text
        hidden = link.find("span", {"class": "accesshide"})
        # An empty hidden span must not wipe the whole name, so only cut when
        # there is actual hidden text to cut at.
        if hidden is not None and hidden.text:
            text = text.partition(hidden.text)[0]
        return unquote(text.strip())[:self._MAX_NAME_LENGTH]

    def _get_url(self):
        """Return the main link of the activity and drop it from ``all_urls``.

        The main link is the first link carrying an ``href``. Returns ``None``
        when there is none.
        """
        link = self.soup.find("a", href=True)
        if link is None:
            return None
        url = link["href"]
        if url in self.all_urls:
            self.all_urls.remove(url)
        return url

    def _get_type(self):
        """Return the third CSS class of the element, or ``None`` if absent."""
        classes = self.soup.get("class") or []
        return classes[2] if len(classes) > 2 else None

    def set_parent_dir(self, parent_dir: str):
        """Remember the directory this activity belongs to."""
        self.parent_dir = parent_dir

    def find_file(self, s: requests.Session) -> bool:
        """Replace ``self.url`` with the address of the underlying file.

        Requests ``self.url`` following redirects. When the final URL is not a
        ``view.php?id=`` page it is taken as the file itself and stored
        without its query string. Otherwise the returned page is searched for
        an embedded link, image or frame.

        Args:
            s: Session used to perform the request.

        Returns:
            ``True`` when ``self.url`` was updated, ``False`` when nothing
            usable was found on the page.
        """
        r = s.get(self.url, allow_redirects=True)
        if "view.php?id=" not in r.url:
            self.url = r.url.split("?")[0]
            return True
        try:
            page = BeautifulSoup(r.text, 'html.parser')
            target = self._find_embedded_url(page)
        except Exception:
            # Best effort: a page that cannot be parsed, or whose element lacks
            # the expected attribute, counts as "no file found".
            return False
        if target is None:
            return False
        self.url = target
        return True

    @staticmethod
    def _find_embedded_url(page):
        """Return the resource address embedded in a view page, or ``None``.

        Inside ``<section id="region-main">`` the first link wins, then the
        first image. Without that section, the ``src`` of the second
        ``<frame>`` is used.
        """
        main_region = page.find("section", {"id": "region-main"})
        if main_region is None:
            frames = page.find_all("frame")
            return frames[1]["src"] if len(frames) > 1 else None
        link = main_region.find("a")
        if link is not None:
            return link["href"]
        img = main_region.find("img")
        return img["src"] if img is not None else None

    def find_folder(self, s: requests.Session):
        """Replace ``self.url`` with the download address of a folder.

        The address is first built from the ``action`` of the first form of
        the element and the ``value`` of that form's first input. When the
        element has no such form, ``self.url`` is requested instead and the
        final ``view.php`` URL is rewritten to ``download_folder.php``.

        Args:
            s: Session used for the fallback request.

        Returns:
            ``True`` when ``self.url`` was updated, ``False`` otherwise.
        """
        try:
            form = self.soup.find("form")
            url = form["action"] + "?id=" + form.find("input")["value"]
        except (AttributeError, KeyError, TypeError):
            # No form, no input, or a missing attribute: fall back to the
            # activity page. Other exceptions are no longer swallowed.
            r = s.get(self.url, allow_redirects=True)
            if "view.php?id=" not in r.url:
                return False
            self.url = r.url.replace("view.php", "download_folder.php")
            return True
        if "download_folder.php" not in url:
            return False
        self.url = url
        return True
class MoodleCourseSection:
    """One section of a Moodle course page together with its activities."""

    # Titles longer than this many characters are truncated.
    _MAX_TITLE_LENGTH = 50

    def __init__(self, soup: bs4.element.Tag):
        """Parse the title and the activities out of the section element.

        Args:
            soup: Parsed element of the section.
        """
        self.soup = soup
        self.title = self._get_title()
        self.activities = self._get_activities()

    def __repr__(self):
        return f"{self.title}: {len(self.activities)} activities"

    def _get_title(self) -> str:
        """Return the text of the ``<h3 class="sectionname">`` heading.

        The title is stripped and cut to at most 50 characters. A section
        without such a heading gets an empty title instead of raising
        ``AttributeError``.
        """
        heading = self.soup.find("h3", {"class": "sectionname"})
        if heading is None:
            return ""
        return unquote(heading.text.strip())[:self._MAX_TITLE_LENGTH]

    def _get_activities(self) -> list[MoodleActivity]:
        """Wrap every ``<li class="activity">`` of the section."""
        return [
            MoodleActivity(item)
            for item in self.soup.find_all("li", {"class": "activity"})
        ]

    @staticmethod
    def _safe_component(text: str) -> str:
        """Replace path separators so *text* stays a single path component."""
        for separator in (os.sep, os.altsep):
            if separator:
                text = text.replace(separator, "_")
        return text

    def set_parent_dir(self, parent_dir: str, index: int):
        """Assign the directory of this section to all of its activities.

        The directory is ``<parent_dir>/<index> - <title>``. Path separators
        inside the title (e.g. a date such as ``19/01``) are replaced by
        ``_`` so that they do not create extra directory levels.

        Args:
            parent_dir: Directory that contains the section directories.
            index: Number used as prefix of the section directory name.
        """
        # Same path for every activity, so build it once.
        section_dir = os.path.join(
            parent_dir, self._safe_component(f"{index} - {self.title}")
        )
        for act in self.activities:
            act.set_parent_dir(section_dir)
class MoodleCourse:
    """A Moodle course identified by its code, its name and its page URL.

    The course page is only downloaded and parsed when ``init`` is called;
    until then ``soup`` and ``sections`` are ``None``.
    """

    def __init__(self, name_with_code: str, url: str):
        """Split *name_with_code* into course code and course name.

        Args:
            name_with_code: Text whose first whitespace-separated word is the
                course code and whose remaining words form the course name.
                An empty or blank text gives an empty code and name.
            url: Address of the course page.
        """
        self.url = url
        words = name_with_code.split()
        self.code = unquote(words[0]) if words else ""
        self.name = unquote(" ".join(words[1:]))
        self.soup = None
        self.sections = None

    def __repr__(self):
        # sections is None until init() ran; report 0 instead of failing.
        count = 0 if self.sections is None else len(self.sections)
        return f"{self.code} {self.name}: {count} sections"

    def init(self, s: requests.Session):
        """Download the course page and parse its sections.

        Args:
            s: Session used to request the course page.
        """
        self.soup = self._get_soup(s)
        self.sections = self._get_sections()

    def _get_soup(self, s: requests.Session) -> BeautifulSoup:
        """Request ``self.url`` and return the parsed page."""
        r = s.get(self.url)
        return BeautifulSoup(r.text, 'html.parser')

    def _get_sections(self) -> list[MoodleCourseSection]:
        """Wrap every ``<li class="course-section">`` of the course page."""
        return [
            MoodleCourseSection(item)
            for item in self.soup.find_all("li", {"class": "course-section"})
        ]

    @staticmethod
    def _safe_component(text: str) -> str:
        """Replace path separators so *text* stays a single path component."""
        for separator in (os.sep, os.altsep):
            if separator:
                text = text.replace(separator, "_")
        return text

    def set_parent_dir(self, parent_dir, year: "str | None" = None):
        """Assign ``<parent_dir>/<code> - <name>/<year>`` to every section.

        Path separators inside the course code or name are replaced by ``_``
        so that they do not create extra directory levels. ``init`` must have
        been called before, otherwise ``sections`` is still ``None`` and
        iterating it raises ``TypeError``.

        Args:
            parent_dir: Directory that contains the course directories.
            year: Name of the sub-directory below the course directory. When
                omitted, the fourth ``/``-separated piece of ``self.url`` is
                used (the first path segment of an ``https://host/...`` URL).

        Raises:
            IndexError: If *year* is omitted and ``self.url`` has fewer than
                four ``/``-separated pieces.
        """
        if year is None:
            year = self.url.split("/")[3]
        # Same path for every section, so build it once.
        course_dir = os.path.join(
            parent_dir, self._safe_component(f"{self.code} - {self.name}"), year
        )
        for i, sec in enumerate(self.sections):
            sec.set_parent_dir(course_dir, i)