From 90d66b299ba65a462dfc083ac67b938df774acc7 Mon Sep 17 00:00:00 2001 From: "Sp4rk.y" Date: Fri, 6 Sep 2024 18:57:23 -0600 Subject: [PATCH] =?UTF-8?q?=E2=99=BB=EF=B8=8F=20(DROP=20service):=20refact?= =?UTF-8?q?or=20get=5Ftitles=20method=20to=20handle=20multiple=20seasons?= =?UTF-8?q?=20and=20improve=20episode=20extraction=20logic?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- services/DROP/__init__.py | 154 +++++++++++--------------------------- 1 file changed, 43 insertions(+), 111 deletions(-) diff --git a/services/DROP/__init__.py b/services/DROP/__init__.py index 60448c9..daf6c5d 100644 --- a/services/DROP/__init__.py +++ b/services/DROP/__init__.py @@ -1,10 +1,8 @@ import re from typing import Optional, Union from http.cookiejar import CookieJar -import datetime import json from bs4 import BeautifulSoup - import click from devine.core.service import Service @@ -70,44 +68,58 @@ class DROP(Service): raise ValueError("Authenticity token not found") def get_titles(self) -> Union[Series]: - url = f"https://www.dropout.tv/{self.title}" + match = re.match(self.TITLE_RE, self.title) + if match: + title_id = match.group("id") + else: + title_id = self.title + + url = f"https://www.dropout.tv/{title_id}" response = self.session.get(url) soup = BeautifulSoup(response.text, 'html.parser') + episodes = [] + season_urls = [] - for item in soup.find_all('div', class_='browse-item-card'): - episode_link = item.find('a', class_='browse-item-link') - if episode_link: - episode_url = episode_link['href'] - episode_id = episode_link['data-track-event-properties'] - episode_id = json.loads(episode_id)['id'] + # Extract season URLs + season_select = soup.find('select', class_='js-switch-season') + if season_select: + for option in season_select.find_all('option'): + season_urls.append(option['value']) - title_elem = item.find('strong') - episode_title = title_elem.text.strip() if title_elem else None + for season_url in season_urls: + season_response = self.session.get(season_url) + season_soup = BeautifulSoup(season_response.text, 'html.parser') - episode_info = item.find('span', class_='media-identifier media-episode') - if episode_info: - episode_number = re.search(r'Episode (\d+)', episode_info.text) - episode_number = int(episode_number.group(1)) if episode_number else None - else: - episode_number = None + season_number = int(re.search(r'/season:(\d+)', season_url).group(1)) - duration_elem = item.find('div', class_='duration-container') - duration = duration_elem.text.strip() if duration_elem else None + for item in season_soup.find_all('div', class_='browse-item-card'): + episode_link = item.find('a', class_='browse-item-link') + if episode_link: + episode_url = episode_link['href'] + episode_data = json.loads(episode_link['data-track-event-properties']) - episodes.append(Episode( - id_=str(episode_id), - service=self.__class__, - title=self.title, - season=1, # Assuming all episodes are from season 1 - number=episode_number, - name=episode_title, - year=None, # You might want to extract this from somewhere else - data={'url': episode_url, 'duration': duration} - )) + episode_id = episode_data['id'] + episode_title = episode_data['label'] - if episodes: - return Series(episodes) + episode_number_elem = item.find('span', class_='media-identifier media-episode') + episode_number = int(re.search(r'Episode (\d+)', episode_number_elem.text).group(1)) if episode_number_elem else None + + show_title = self.title.split('/')[-1].replace('-', ' ').title() + + episode = Episode( + id_=str(episode_id), + service=self.__class__, + title=show_title, + season=season_number, + number=episode_number, + name=episode_title, + year=None, # You might want to extract this from somewhere else + data={'url': episode_url} + ) + episodes.append(episode) + + return Series(episodes) def get_tracks(self, title: Union[Episode]) -> Tracks: vimeo_id = title.data["vimeo_id"] @@ -130,86 +142,6 @@ class DROP(Service): return tracks - def _extract_vimeo_data(self, webpage): - vimeo_config = self._search_regex( - r"playerConfig\s*=\s*({.+?})\s*;", webpage, "vimeo player config", default=None - ) - if vimeo_config: - return json.loads(vimeo_config) - return None - - def _get_series(self, series_id: str) -> Series: - webpage = self.session.get(f"https://www.dropout.tv/{series_id}") - if not webpage.ok: - self.log.error(f"Failed to download series page: {series_id}") - return Series() - - webpage_text = webpage.text - entries = [] - - # Find the script tag containing the series data - series_data_match = re.search(r'', webpage_text, re.DOTALL) - if series_data_match: - try: - series_data = json.loads(series_data_match.group(1)) - if isinstance(series_data, list): - series_data = series_data[0] # Take the first item if it's a list - - if series_data.get("@type") == "TVSeries": - for season in series_data.get("season", []): - season_number = int(season.get("seasonNumber", 0)) - for episode in season.get("episode", []): - episode_url = episode.get("url") - if episode_url: - episode_id = self._match_id(episode_url) - entries.append(self._get_single_episode(series_id, season_number, episode_id)) - except json.JSONDecodeError: - self.log.error(f"Failed to parse series JSON data for: {series_id}") - - if not entries: - self.log.error(f"No episodes found for series: {series_id}") - - return Series(entries) - - def _match_id(self, url): - # Extract the episode ID from the URL - match = re.search(r'/videos/([^/]+)', url) - return match.group(1) if match else None - - def _get_single_episode(self, series_id: str, season_number: int, episode_id: str) -> Episode: - url = f"https://www.dropout.tv/{series_id}/season:{season_number}/videos/{episode_id}" - webpage = self.session.get(url).text - - vimeo_data = self._extract_vimeo_data(webpage) - if not vimeo_data: - self.log.error(f"Failed to extract Vimeo data for episode: {episode_id}") - return None - - video_data = vimeo_data.get("video", {}) - title = video_data.get("title") - description = video_data.get("description") - duration = int(video_data.get("duration", 0)) - release_date = video_data.get("release_date") - - if release_date: - release_date = datetime.datetime.strptime(release_date, "%Y-%m-%d").date() - - episode = Episode( - id_=episode_id, - service=self.__class__, - title=title, - description=description, - season=season_number, - number=int(video_data.get("episode_number", 0)), - year=release_date.year if release_date else None, - duration=duration, - data={ - "vimeo_id": video_data.get("id"), - "embed_url": vimeo_data.get("embed", {}).get("html"), - }, - ) - return episode - def get_chapters(self, title): # Implement if DROPOUT.tv provides chapter information return []