♻️ (DROP service): refactor get_titles method to handle multiple seasons and improve episode extraction logic
This commit is contained in:
parent cccac84675
commit 90d66b299b
@@ -1,10 +1,8 @@
 import re
 from typing import Optional, Union
 from http.cookiejar import CookieJar
-import datetime
 import json
 from bs4 import BeautifulSoup

 import click

 from devine.core.service import Service
@@ -70,43 +68,57 @@ class DROP(Service):
             raise ValueError("Authenticity token not found")

     def get_titles(self) -> Union[Series]:
-        url = f"https://www.dropout.tv/{self.title}"
+        match = re.match(self.TITLE_RE, self.title)
+        if match:
+            title_id = match.group("id")
+        else:
+            title_id = self.title
+
+        url = f"https://www.dropout.tv/{title_id}"
         response = self.session.get(url)
         soup = BeautifulSoup(response.text, 'html.parser')
         episodes = []
-
-        for item in soup.find_all('div', class_='browse-item-card'):
+        season_urls = []
+
+        # Extract season URLs
+        season_select = soup.find('select', class_='js-switch-season')
+        if season_select:
+            for option in season_select.find_all('option'):
+                season_urls.append(option['value'])
+
+        for season_url in season_urls:
+            season_response = self.session.get(season_url)
+            season_soup = BeautifulSoup(season_response.text, 'html.parser')
+
+            season_number = int(re.search(r'/season:(\d+)', season_url).group(1))
+
+            for item in season_soup.find_all('div', class_='browse-item-card'):
                 episode_link = item.find('a', class_='browse-item-link')
                 if episode_link:
                     episode_url = episode_link['href']
-                    episode_id = episode_link['data-track-event-properties']
-                    episode_id = json.loads(episode_id)['id']
-
-                    title_elem = item.find('strong')
-                    episode_title = title_elem.text.strip() if title_elem else None
+                    episode_data = json.loads(episode_link['data-track-event-properties'])
+                    episode_id = episode_data['id']
+                    episode_title = episode_data['label']

-                    episode_info = item.find('span', class_='media-identifier media-episode')
-                    if episode_info:
-                        episode_number = re.search(r'Episode (\d+)', episode_info.text)
-                        episode_number = int(episode_number.group(1)) if episode_number else None
-                    else:
-                        episode_number = None
-
-                    duration_elem = item.find('div', class_='duration-container')
-                    duration = duration_elem.text.strip() if duration_elem else None
-
-                    episodes.append(Episode(
+                    episode_number_elem = item.find('span', class_='media-identifier media-episode')
+                    episode_number = int(re.search(r'Episode (\d+)', episode_number_elem.text).group(1)) if episode_number_elem else None
+
+                    show_title = self.title.split('/')[-1].replace('-', ' ').title()
+
+                    episode = Episode(
                         id_=str(episode_id),
                         service=self.__class__,
-                        title=self.title,
-                        season=1,  # Assuming all episodes are from season 1
+                        title=show_title,
+                        season=season_number,
                         number=episode_number,
                         name=episode_title,
                         year=None,  # You might want to extract this from somewhere else
-                        data={'url': episode_url, 'duration': duration}
-                    ))
+                        data={'url': episode_url}
+                    )
+
+                    episodes.append(episode)

-        if episodes:
-            return Series(episodes)
+        return Series(episodes)

     def get_tracks(self, title: Union[Episode]) -> Tracks:
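For reference, the season-walking flow introduced in the hunk above can be exercised outside the service class. A minimal sketch, assuming the markup shown in the diff (the js-switch-season dropdown and browse-item-card entries); the function name, dict layout, season-1 fallback, and the plain unauthenticated requests session are illustrative stand-ins, not part of the commit:

import json
import re

import requests
from bs4 import BeautifulSoup

def list_episodes(series_slug: str) -> list[dict]:
    """Standalone sketch of the refactored get_titles() flow."""
    session = requests.Session()  # the real service reuses its authenticated self.session
    soup = BeautifulSoup(session.get(f"https://www.dropout.tv/{series_slug}").text, "html.parser")

    # Each <option> in the season <select> carries a full season URL in its value.
    season_select = soup.find("select", class_="js-switch-season")
    season_urls = [o["value"] for o in season_select.find_all("option")] if season_select else []

    episodes = []
    # Note: like the diff, a page without a season dropdown yields no episodes.
    for season_url in season_urls:
        season_soup = BeautifulSoup(session.get(season_url).text, "html.parser")
        season_match = re.search(r"/season:(\d+)", season_url)
        season_number = int(season_match.group(1)) if season_match else 1  # assumed fallback

        for item in season_soup.find_all("div", class_="browse-item-card"):
            link = item.find("a", class_="browse-item-link")
            if not link:
                continue
            # The tracking attribute holds JSON with the episode's id and label.
            data = json.loads(link["data-track-event-properties"])
            number_elem = item.find("span", class_="media-identifier media-episode")
            number_match = re.search(r"Episode (\d+)", number_elem.text) if number_elem else None
            episodes.append({
                "id": data["id"],
                "name": data["label"],
                "season": season_number,
                "number": int(number_match.group(1)) if number_match else None,
                "url": link["href"],
            })
    return episodes

One caveat the sketch guards against: the diff's episode_number expression calls .group(1) directly on the re.search result, so a card whose media-episode span exists but lacks an "Episode N" label would raise AttributeError rather than fall back to None.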
@@ -130,86 +142,6 @@ class DROP(Service):

         return tracks

-    def _extract_vimeo_data(self, webpage):
-        vimeo_config = self._search_regex(
-            r"playerConfig\s*=\s*({.+?})\s*;", webpage, "vimeo player config", default=None
-        )
-        if vimeo_config:
-            return json.loads(vimeo_config)
-        return None
-
-    def _get_series(self, series_id: str) -> Series:
-        webpage = self.session.get(f"https://www.dropout.tv/{series_id}")
-        if not webpage.ok:
-            self.log.error(f"Failed to download series page: {series_id}")
-            return Series()
-
-        webpage_text = webpage.text
-        entries = []
-
-        # Find the script tag containing the series data
-        series_data_match = re.search(r'<script type="application/ld\+json">(.*?)</script>', webpage_text, re.DOTALL)
-        if series_data_match:
-            try:
-                series_data = json.loads(series_data_match.group(1))
-                if isinstance(series_data, list):
-                    series_data = series_data[0]  # Take the first item if it's a list
-
-                if series_data.get("@type") == "TVSeries":
-                    for season in series_data.get("season", []):
-                        season_number = int(season.get("seasonNumber", 0))
-                        for episode in season.get("episode", []):
-                            episode_url = episode.get("url")
-                            if episode_url:
-                                episode_id = self._match_id(episode_url)
-                                entries.append(self._get_single_episode(series_id, season_number, episode_id))
-            except json.JSONDecodeError:
-                self.log.error(f"Failed to parse series JSON data for: {series_id}")
-
-        if not entries:
-            self.log.error(f"No episodes found for series: {series_id}")
-
-        return Series(entries)
-
-    def _match_id(self, url):
-        # Extract the episode ID from the URL
-        match = re.search(r'/videos/([^/]+)', url)
-        return match.group(1) if match else None
-
-    def _get_single_episode(self, series_id: str, season_number: int, episode_id: str) -> Episode:
-        url = f"https://www.dropout.tv/{series_id}/season:{season_number}/videos/{episode_id}"
-        webpage = self.session.get(url).text
-
-        vimeo_data = self._extract_vimeo_data(webpage)
-        if not vimeo_data:
-            self.log.error(f"Failed to extract Vimeo data for episode: {episode_id}")
-            return None
-
-        video_data = vimeo_data.get("video", {})
-        title = video_data.get("title")
-        description = video_data.get("description")
-        duration = int(video_data.get("duration", 0))
-        release_date = video_data.get("release_date")
-
-        if release_date:
-            release_date = datetime.datetime.strptime(release_date, "%Y-%m-%d").date()
-
-        episode = Episode(
-            id_=episode_id,
-            service=self.__class__,
-            title=title,
-            description=description,
-            season=season_number,
-            number=int(video_data.get("episode_number", 0)),
-            year=release_date.year if release_date else None,
-            duration=duration,
-            data={
-                "vimeo_id": video_data.get("id"),
-                "embed_url": vimeo_data.get("embed", {}).get("html"),
-            },
-        )
-        return episode
-
     def get_chapters(self, title):
         # Implement if DROPOUT.tv provides chapter information
         return []
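The new slug handling at the top of get_titles() depends on self.TITLE_RE, which is defined outside this diff. A hypothetical pattern illustrating the intended fallback behaviour (the service's real regex may differ):

import re

# Hypothetical stand-in -- the actual TITLE_RE is not shown in this commit.
TITLE_RE = r"^(?:https?://(?:www\.)?dropout\.tv/)?(?P<id>[a-z0-9-]+)"

for title in ("https://www.dropout.tv/game-changer", "game-changer"):
    match = re.match(TITLE_RE, title)
    title_id = match.group("id") if match else title  # mirrors the diff's if/else fallback
    print(title_id)  # -> game-changer for both inputs

Either way, a full URL or a bare slug normalises to the same title_id, which is why the diff can build the page URL from title_id unconditionally.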