♻️ (DROP service): refactor get_titles method to handle multiple seasons and improve episode extraction logic

This commit is contained in:
Sp4rk.y 2024-09-06 18:57:23 -06:00
parent cccac84675
commit 90d66b299b

View File

@ -1,10 +1,8 @@
import re import re
from typing import Optional, Union from typing import Optional, Union
from http.cookiejar import CookieJar from http.cookiejar import CookieJar
import datetime
import json import json
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
import click import click
from devine.core.service import Service from devine.core.service import Service
@ -70,43 +68,57 @@ class DROP(Service):
raise ValueError("Authenticity token not found") raise ValueError("Authenticity token not found")
def get_titles(self) -> Union[Series]:
    """Scrape the show page and return a ``Series`` with every listed episode.

    The title is resolved to an id via ``TITLE_RE`` (falling back to the raw
    title), the show page is fetched, and each season URL offered by the
    ``js-switch-season`` selector is fetched and parsed in turn. When the page
    has no season selector, the episode cards on the show page itself are
    parsed as season 1.

    Returns:
        A ``Series`` built from the episodes found; it is built from an empty
        list when nothing could be parsed.
    """
    match = re.match(self.TITLE_RE, self.title)
    title_id = match.group("id") if match else self.title

    response = self.session.get(f"https://www.dropout.tv/{title_id}")
    soup = BeautifulSoup(response.text, 'html.parser')

    # Display name derived from the last path segment of the title; computed
    # once here because it does not vary per episode. Trailing slashes are
    # stripped so the last segment is never empty.
    show_title = self.title.rstrip('/').split('/')[-1].replace('-', ' ').title()

    # Extract season URLs, ignoring <option> tags without a usable value
    season_urls = []
    season_select = soup.find('select', class_='js-switch-season')
    if season_select:
        season_urls = [
            option['value']
            for option in season_select.find_all('option')
            if option.get('value')
        ]

    episodes = []
    if not season_urls:
        # No season selector: read the cards straight off the show page,
        # treating them as season 1.
        episodes.extend(self._parse_episode_cards(soup, 1, show_title))

    for season_url in season_urls:
        season_match = re.search(r'/season:(\d+)', season_url)
        if not season_match:
            self.log.error(f"Could not determine season number from URL: {season_url}")
            continue
        season_response = self.session.get(season_url)
        season_soup = BeautifulSoup(season_response.text, 'html.parser')
        episodes.extend(
            self._parse_episode_cards(season_soup, int(season_match.group(1)), show_title)
        )

    if not episodes:
        self.log.error(f"No episodes found for title: {self.title}")
    return Series(episodes)


def _parse_episode_cards(self, soup, season_number, show_title):
    """Build one ``Episode`` per usable ``browse-item-card`` div in *soup*."""
    parsed = []
    for item in soup.find_all('div', class_='browse-item-card'):
        episode_link = item.find('a', class_='browse-item-link')
        if not episode_link:
            continue

        try:
            episode_url = episode_link['href']
            episode_data = json.loads(episode_link['data-track-event-properties'])
            episode_id = episode_data['id']
        except (KeyError, TypeError, json.JSONDecodeError):
            # One malformed card should not abort the whole listing.
            self.log.error("Skipping episode card with missing or invalid tracking data")
            continue

        # Prefer the tracking label; fall back to the card's <strong> text.
        episode_title = episode_data.get('label')
        if episode_title is None:
            title_elem = item.find('strong')
            episode_title = title_elem.text.strip() if title_elem else None

        # The span may exist without an "Episode N" marker, so guard the match.
        number_elem = item.find('span', class_='media-identifier media-episode')
        number_match = re.search(r'Episode (\d+)', number_elem.text) if number_elem else None
        episode_number = int(number_match.group(1)) if number_match else None

        parsed.append(Episode(
            id_=str(episode_id),
            service=self.__class__,
            title=show_title,
            season=season_number,
            number=episode_number,
            name=episode_title,
            year=None,  # not available on the listing cards
            data={'url': episode_url}
        ))
    return parsed
def get_tracks(self, title: Union[Episode]) -> Tracks: def get_tracks(self, title: Union[Episode]) -> Tracks:
@ -130,86 +142,6 @@ class DROP(Service):
return tracks return tracks
def _extract_vimeo_data(self, webpage):
vimeo_config = self._search_regex(
r"playerConfig\s*=\s*({.+?})\s*;", webpage, "vimeo player config", default=None
)
if vimeo_config:
return json.loads(vimeo_config)
return None
def _get_series(self, series_id: str) -> Series:
    """Build a ``Series`` from the JSON-LD metadata on a series page.

    Fetches ``https://www.dropout.tv/{series_id}``, reads the first
    ``application/ld+json`` script tag and, when it describes a ``TVSeries``,
    resolves every episode URL listed under its seasons into an ``Episode``.

    Args:
        series_id: Path segment identifying the series on dropout.tv.

    Returns:
        A ``Series`` of the episodes that could be resolved; an empty
        ``Series`` when the page cannot be downloaded.
    """
    webpage = self.session.get(f"https://www.dropout.tv/{series_id}")
    if not webpage.ok:
        self.log.error(f"Failed to download series page: {series_id}")
        return Series()

    entries = []

    # Find the script tag containing the series data
    series_data_match = re.search(
        r'<script type="application/ld\+json">(.*?)</script>', webpage.text, re.DOTALL
    )
    series_data = None
    if series_data_match:
        try:
            series_data = json.loads(series_data_match.group(1))
        except json.JSONDecodeError:
            self.log.error(f"Failed to parse series JSON data for: {series_id}")

    if isinstance(series_data, list):
        # Take the first item if it's a list; an empty list carries no data
        series_data = series_data[0] if series_data else None

    if isinstance(series_data, dict) and series_data.get("@type") == "TVSeries":
        for season in series_data.get("season", []):
            # "or 0" also covers an explicit null seasonNumber
            season_number = int(season.get("seasonNumber") or 0)
            for episode in season.get("episode", []):
                episode_url = episode.get("url")
                if not episode_url:
                    continue
                episode_id = self._match_id(episode_url)
                if episode_id is None:
                    # URL has no /videos/<id> part; nothing to look up
                    continue
                entry = self._get_single_episode(series_id, season_number, episode_id)
                if entry is not None:
                    # Failed lookups come back as None and must not be listed
                    entries.append(entry)

    if not entries:
        self.log.error(f"No episodes found for series: {series_id}")

    return Series(entries)
def _match_id(self, url):
# Extract the episode ID from the URL
match = re.search(r'/videos/([^/]+)', url)
return match.group(1) if match else None
def _get_single_episode(self, series_id: str, season_number: int, episode_id: str) -> Optional[Episode]:
    """Fetch one episode page and build an ``Episode`` from its player config.

    Args:
        series_id: Path segment identifying the series.
        season_number: Season number used in the episode URL and on the result.
        episode_id: Path segment identifying the episode under ``/videos/``.

    Returns:
        The populated ``Episode``, or ``None`` when no player config could be
        extracted from the page.
    """
    url = f"https://www.dropout.tv/{series_id}/season:{season_number}/videos/{episode_id}"
    webpage = self.session.get(url).text

    vimeo_data = self._extract_vimeo_data(webpage)
    if not vimeo_data:
        self.log.error(f"Failed to extract Vimeo data for episode: {episode_id}")
        return None

    # "or {}" / "or 0" below also cover keys that are present but null
    video_data = vimeo_data.get("video") or {}

    release_year = None
    release_date = video_data.get("release_date")
    if release_date:
        try:
            release_year = datetime.datetime.strptime(release_date, "%Y-%m-%d").year
        except (TypeError, ValueError):
            # A date in an unexpected format should not fail the episode
            release_year = None

    # NOTE(review): `description` and `duration` are passed as Episode kwargs
    # here but not by get_titles — confirm Episode accepts them.
    return Episode(
        id_=episode_id,
        service=self.__class__,
        title=video_data.get("title"),
        description=video_data.get("description"),
        season=season_number,
        number=int(video_data.get("episode_number") or 0),
        year=release_year,
        duration=int(video_data.get("duration") or 0),
        data={
            "vimeo_id": video_data.get("id"),
            "embed_url": (vimeo_data.get("embed") or {}).get("html"),
        },
    )
def get_chapters(self, title):
    """Return the chapters for *title*; this stub always yields an empty list."""
    # Implement if DROPOUT.tv provides chapter information
    no_chapters = []
    return no_chapters