♻️ (DROP service): refactor get_titles method to handle multiple seasons and improve episode extraction logic
This commit is contained in:
parent
cccac84675
commit
90d66b299b
@ -1,10 +1,8 @@
|
||||
import re
|
||||
from typing import Optional, Union
|
||||
from http.cookiejar import CookieJar
|
||||
import datetime
|
||||
import json
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
import click
|
||||
|
||||
from devine.core.service import Service
|
||||
@ -70,44 +68,58 @@ class DROP(Service):
|
||||
raise ValueError("Authenticity token not found")
|
||||
|
||||
def get_titles(self) -> Union[Series]:
    """Build a Series containing every episode across all seasons of the show.

    Resolves the show slug from ``self.title`` (optionally via ``TITLE_RE``),
    loads the show page, collects each season's URL from the season-switcher
    ``<select>``, then scrapes every episode card from each season page.

    Returns:
        Series of Episode objects (empty when no seasons/episodes are found).
    """
    match = re.match(self.TITLE_RE, self.title)
    title_id = match.group("id") if match else self.title

    url = f"https://www.dropout.tv/{title_id}"
    response = self.session.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')

    episodes = []

    # The season switcher holds one <option value="..."> per season URL.
    season_urls = []
    season_select = soup.find('select', class_='js-switch-season')
    if season_select:
        for option in season_select.find_all('option'):
            season_urls.append(option['value'])

    for season_url in season_urls:
        # Season number is encoded in the URL as ".../season:N".
        season_match = re.search(r'/season:(\d+)', season_url)
        if not season_match:
            # Skip malformed option values rather than crashing on .group().
            continue
        season_number = int(season_match.group(1))

        season_response = self.session.get(season_url)
        season_soup = BeautifulSoup(season_response.text, 'html.parser')

        for item in season_soup.find_all('div', class_='browse-item-card'):
            episode_link = item.find('a', class_='browse-item-link')
            if not episode_link:
                continue

            episode_url = episode_link['href']
            episode_data = json.loads(episode_link['data-track-event-properties'])
            episode_id = episode_data['id']
            episode_title = episode_data['label']

            # "Episode N" badge; absent on specials/trailers, so default None.
            episode_number = None
            episode_number_elem = item.find('span', class_='media-identifier media-episode')
            if episode_number_elem:
                number_match = re.search(r'Episode (\d+)', episode_number_elem.text)
                if number_match:
                    episode_number = int(number_match.group(1))

            # Derive a human-readable show title from the URL slug.
            show_title = self.title.split('/')[-1].replace('-', ' ').title()

            episodes.append(Episode(
                id_=str(episode_id),
                service=self.__class__,
                title=show_title,
                season=season_number,
                number=episode_number,
                name=episode_title,
                year=None,  # release year is not available on the listing page
                data={'url': episode_url}
            ))

    return Series(episodes)
|
||||
|
||||
def get_tracks(self, title: Union[Episode]) -> Tracks:
|
||||
vimeo_id = title.data["vimeo_id"]
|
||||
@ -130,86 +142,6 @@ class DROP(Service):
|
||||
|
||||
return tracks
|
||||
|
||||
def _extract_vimeo_data(self, webpage):
    """Parse the embedded Vimeo OTT player config out of a page's HTML.

    Returns the config as a dict, or None when no ``playerConfig = {...};``
    assignment is present in the page.
    """
    raw_config = self._search_regex(
        r"playerConfig\s*=\s*({.+?})\s*;", webpage, "vimeo player config", default=None
    )
    if not raw_config:
        return None
    return json.loads(raw_config)
||||
|
||||
def _get_series(self, series_id: str) -> Series:
    """Scrape a series page and return a Series of all its episodes.

    Episodes are discovered via the JSON-LD (schema.org TVSeries) blob
    embedded in the page; each one is then fetched individually through
    ``_get_single_episode``.
    """
    webpage = self.session.get(f"https://www.dropout.tv/{series_id}")
    if not webpage.ok:
        self.log.error(f"Failed to download series page: {series_id}")
        return Series()

    webpage_text = webpage.text
    entries = []

    # Find the script tag containing the series data
    series_data_match = re.search(r'<script type="application/ld\+json">(.*?)</script>', webpage_text, re.DOTALL)
    if series_data_match:
        try:
            series_data = json.loads(series_data_match.group(1))
            if isinstance(series_data, list):
                series_data = series_data[0]  # Take the first item if it's a list

            if series_data.get("@type") == "TVSeries":
                for season in series_data.get("season", []):
                    season_number = int(season.get("seasonNumber", 0))
                    for episode in season.get("episode", []):
                        episode_url = episode.get("url")
                        if not episode_url:
                            continue
                        episode_id = self._match_id(episode_url)
                        if episode_id is None:
                            # URL did not contain a /videos/<id> segment.
                            continue
                        entry = self._get_single_episode(series_id, season_number, episode_id)
                        # _get_single_episode returns None when Vimeo data
                        # extraction fails; never put None into the Series.
                        if entry is not None:
                            entries.append(entry)
        except json.JSONDecodeError:
            self.log.error(f"Failed to parse series JSON data for: {series_id}")

    if not entries:
        self.log.error(f"No episodes found for series: {series_id}")

    return Series(entries)
|
||||
|
||||
def _match_id(self, url):
|
||||
# Extract the episode ID from the URL
|
||||
match = re.search(r'/videos/([^/]+)', url)
|
||||
return match.group(1) if match else None
|
||||
|
||||
def _get_single_episode(self, series_id: str, season_number: int, episode_id: str) -> Episode:
    """Fetch one episode page and build an Episode from its Vimeo config.

    Args:
        series_id: the show's URL slug.
        season_number: 1-based season index used to build the URL.
        episode_id: the episode's URL slug.

    Returns:
        An Episode, or None when the Vimeo player config cannot be
        extracted from the page (callers must handle None).
    """
    url = f"https://www.dropout.tv/{series_id}/season:{season_number}/videos/{episode_id}"
    webpage = self.session.get(url).text

    vimeo_data = self._extract_vimeo_data(webpage)
    if not vimeo_data:
        self.log.error(f"Failed to extract Vimeo data for episode: {episode_id}")
        return None

    video_data = vimeo_data.get("video", {})
    title = video_data.get("title")
    description = video_data.get("description")
    # "duration" may be present but null in the player config; `or 0`
    # avoids int(None) crashing where .get's default would not apply.
    duration = int(video_data.get("duration") or 0)
    release_date = video_data.get("release_date")

    if release_date:
        try:
            release_date = datetime.datetime.strptime(release_date, "%Y-%m-%d").date()
        except ValueError:
            # Unexpected date format; treat as unknown rather than crash.
            release_date = None

    episode = Episode(
        id_=episode_id,
        service=self.__class__,
        title=title,
        description=description,
        season=season_number,
        number=int(video_data.get("episode_number") or 0),
        year=release_date.year if release_date else None,
        duration=duration,
        data={
            "vimeo_id": video_data.get("id"),
            "embed_url": vimeo_data.get("embed", {}).get("html"),
        },
    )
    return episode
|
||||
|
||||
def get_chapters(self, title):
    """Return chapter markers for *title*; Dropout exposes none."""
    # TODO: implement if DROPOUT.tv ever provides chapter information.
    return []
|
||||
|
Loading…
Reference in New Issue
Block a user