♻️ (DROP service): refactor get_titles method to handle multiple seasons and improve episode extraction logic

Sp4rk.y 2024-09-06 18:57:23 -06:00
parent cccac84675
commit 90d66b299b


@@ -1,10 +1,8 @@
 import re
 from typing import Optional, Union
 from http.cookiejar import CookieJar
-import datetime
 import json
 from bs4 import BeautifulSoup
 import click
 
 from devine.core.service import Service
@@ -70,43 +68,57 @@ class DROP(Service):
             raise ValueError("Authenticity token not found")
 
     def get_titles(self) -> Union[Series]:
-        url = f"https://www.dropout.tv/{self.title}"
+        match = re.match(self.TITLE_RE, self.title)
+        if match:
+            title_id = match.group("id")
+        else:
+            title_id = self.title
+        url = f"https://www.dropout.tv/{title_id}"
         response = self.session.get(url)
         soup = BeautifulSoup(response.text, 'html.parser')
 
-        episodes = []
-        for item in soup.find_all('div', class_='browse-item-card'):
+        episodes = []
+        season_urls = []
+
+        # Extract season URLs
+        season_select = soup.find('select', class_='js-switch-season')
+        if season_select:
+            for option in season_select.find_all('option'):
+                season_urls.append(option['value'])
+
+        for season_url in season_urls:
+            season_response = self.session.get(season_url)
+            season_soup = BeautifulSoup(season_response.text, 'html.parser')
+            season_number = int(re.search(r'/season:(\d+)', season_url).group(1))
+
+            for item in season_soup.find_all('div', class_='browse-item-card'):
                 episode_link = item.find('a', class_='browse-item-link')
                 if episode_link:
                     episode_url = episode_link['href']
-                episode_id = episode_link['data-track-event-properties']
-                episode_id = json.loads(episode_id)['id']
-                title_elem = item.find('strong')
-                episode_title = title_elem.text.strip() if title_elem else None
+                    episode_data = json.loads(episode_link['data-track-event-properties'])
+                    episode_id = episode_data['id']
+                    episode_title = episode_data['label']
 
-                episode_info = item.find('span', class_='media-identifier media-episode')
-                if episode_info:
-                    episode_number = re.search(r'Episode (\d+)', episode_info.text)
-                    episode_number = int(episode_number.group(1)) if episode_number else None
-                else:
-                    episode_number = None
+                    episode_number_elem = item.find('span', class_='media-identifier media-episode')
+                    episode_number = int(re.search(r'Episode (\d+)', episode_number_elem.text).group(1)) if episode_number_elem else None
 
-                duration_elem = item.find('div', class_='duration-container')
-                duration = duration_elem.text.strip() if duration_elem else None
+                    show_title = self.title.split('/')[-1].replace('-', ' ').title()
 
-                episodes.append(Episode(
+                    episode = Episode(
                         id_=str(episode_id),
                         service=self.__class__,
-                    title=self.title,
-                    season=1,  # Assuming all episodes are from season 1
+                        title=show_title,
+                        season=season_number,
                         number=episode_number,
                         name=episode_title,
-                    year=None,  # You might want to extract this from somewhere else
-                    data={'url': episode_url, 'duration': duration}
-                ))
+                        data={'url': episode_url}
+                    )
+                    episodes.append(episode)
 
+        if episodes:
             return Series(episodes)
 
     def get_tracks(self, title: Union[Episode]) -> Tracks:
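The core of this refactor is walking the season dropdown instead of assuming everything is season 1. Below is a minimal standalone sketch of that approach with plain requests + BeautifulSoup, outside the devine Service plumbing: the CSS classes (js-switch-season, browse-item-card, browse-item-link), the data-track-event-properties JSON blob, and the /season:N URL pattern are taken from the diff above, while the function name, return shape, and example URL are hypothetical.

import json
import re

import requests
from bs4 import BeautifulSoup

def list_episodes(show_url: str) -> list[dict]:
    session = requests.Session()
    soup = BeautifulSoup(session.get(show_url).text, "html.parser")

    # Each <option> in the season dropdown carries that season's full URL.
    season_select = soup.find("select", class_="js-switch-season")
    if season_select:
        season_urls = [opt["value"] for opt in season_select.find_all("option")]
    else:
        season_urls = [show_url]  # assumption: single-season shows may have no dropdown

    episodes = []
    for season_url in season_urls:
        season_soup = BeautifulSoup(session.get(season_url).text, "html.parser")
        season_match = re.search(r"/season:(\d+)", season_url)
        season_number = int(season_match.group(1)) if season_match else 1

        for card in season_soup.find_all("div", class_="browse-item-card"):
            link = card.find("a", class_="browse-item-link")
            if not link:
                continue
            # The tracking attribute is a JSON blob carrying the episode id and
            # display label, which replaces the old <strong>-tag title scraping.
            meta = json.loads(link["data-track-event-properties"])
            number_elem = card.find("span", class_="media-identifier media-episode")
            number_match = re.search(r"Episode (\d+)", number_elem.text) if number_elem else None
            episodes.append({
                "id": meta["id"],
                "name": meta["label"],
                "season": season_number,
                "number": int(number_match.group(1)) if number_match else None,
                "url": link["href"],
            })
    return episodes

# Hypothetical usage:
# list_episodes("https://www.dropout.tv/some-show")

Note that, unlike the committed code, the sketch guards the /season:(\d+) search: the diff's season_number = int(re.search(...).group(1)) raises AttributeError if an option value ever lacks the season segment.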
@@ -130,86 +142,6 @@ class DROP(Service):
         return tracks
 
-    def _extract_vimeo_data(self, webpage):
-        vimeo_config = self._search_regex(
-            r"playerConfig\s*=\s*({.+?})\s*;", webpage, "vimeo player config", default=None
-        )
-        if vimeo_config:
-            return json.loads(vimeo_config)
-        return None
-
-    def _get_series(self, series_id: str) -> Series:
-        webpage = self.session.get(f"https://www.dropout.tv/{series_id}")
-        if not webpage.ok:
-            self.log.error(f"Failed to download series page: {series_id}")
-            return Series()
-        webpage_text = webpage.text
-        entries = []
-
-        # Find the script tag containing the series data
-        series_data_match = re.search(r'<script type="application/ld\+json">(.*?)</script>', webpage_text, re.DOTALL)
-        if series_data_match:
-            try:
-                series_data = json.loads(series_data_match.group(1))
-                if isinstance(series_data, list):
-                    series_data = series_data[0]  # Take the first item if it's a list
-                if series_data.get("@type") == "TVSeries":
-                    for season in series_data.get("season", []):
-                        season_number = int(season.get("seasonNumber", 0))
-                        for episode in season.get("episode", []):
-                            episode_url = episode.get("url")
-                            if episode_url:
-                                episode_id = self._match_id(episode_url)
-                                entries.append(self._get_single_episode(series_id, season_number, episode_id))
-            except json.JSONDecodeError:
-                self.log.error(f"Failed to parse series JSON data for: {series_id}")
-
-        if not entries:
-            self.log.error(f"No episodes found for series: {series_id}")
-
-        return Series(entries)
-
-    def _match_id(self, url):
-        # Extract the episode ID from the URL
-        match = re.search(r'/videos/([^/]+)', url)
-        return match.group(1) if match else None
-
-    def _get_single_episode(self, series_id: str, season_number: int, episode_id: str) -> Episode:
-        url = f"https://www.dropout.tv/{series_id}/season:{season_number}/videos/{episode_id}"
-        webpage = self.session.get(url).text
-        vimeo_data = self._extract_vimeo_data(webpage)
-        if not vimeo_data:
-            self.log.error(f"Failed to extract Vimeo data for episode: {episode_id}")
-            return None
-
-        video_data = vimeo_data.get("video", {})
-        title = video_data.get("title")
-        description = video_data.get("description")
-        duration = int(video_data.get("duration", 0))
-        release_date = video_data.get("release_date")
-        if release_date:
-            release_date = datetime.datetime.strptime(release_date, "%Y-%m-%d").date()
-
-        episode = Episode(
-            id_=episode_id,
-            service=self.__class__,
-            title=title,
-            description=description,
-            season=season_number,
-            number=int(video_data.get("episode_number", 0)),
-            year=release_date.year if release_date else None,
-            duration=duration,
-            data={
-                "vimeo_id": video_data.get("id"),
-                "embed_url": vimeo_data.get("embed", {}).get("html"),
-            },
-        )
-        return episode
 
     def get_chapters(self, title):
         # Implement if DROPOUT.tv provides chapter information
         return []
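Two notes on the deleted helpers. First, _extract_vimeo_data called self._search_regex, a helper that does not appear anywhere in the shown code (the name matches a yt-dlp InfoExtractor method), so that path was likely already non-functional. A plain-re sketch of the same playerConfig extraction, reusing the exact pattern from the removed method; the function name is hypothetical:

import json
import re
from typing import Optional

def extract_player_config(webpage: str) -> Optional[dict]:
    # The page embeds an inline `playerConfig = {...};` assignment;
    # capture the object literal and parse it as JSON, as the removed
    # helper did via json.loads.
    match = re.search(r"playerConfig\s*=\s*({.+?})\s*;", webpage)
    if not match:
        return None
    try:
        return json.loads(match.group(1))
    except json.JSONDecodeError:
        return None

Second, _get_series read the page's schema.org markup rather than the visible HTML. A compact sketch of that ld+json technique, which can still serve as a fallback to the dropdown scraping above; the function name is hypothetical and the TVSeries shape follows the removed code's assumptions:

import json
import re
from typing import Iterator, Tuple

def iter_ldjson_episodes(webpage: str) -> Iterator[Tuple[int, str]]:
    # Yields (season_number, episode_url) pairs from the page's
    # <script type="application/ld+json"> TVSeries block, if present.
    match = re.search(
        r'<script type="application/ld\+json">(.*?)</script>', webpage, re.DOTALL
    )
    if not match:
        return
    data = json.loads(match.group(1))
    if isinstance(data, list):  # some pages wrap the object in a list
        data = data[0]
    if data.get("@type") != "TVSeries":
        return
    for season in data.get("season", []):
        season_number = int(season.get("seasonNumber", 0))
        for episode in season.get("episode", []):
            if episode.get("url"):
                yield season_number, episode["url"]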