unshackle-services/KNPY/__init__.py

import base64
import json
import re
from datetime import datetime, timezone
from http.cookiejar import CookieJar
from typing import List, Optional

from collections.abc import Generator
import click
import jwt
from langcodes import Language

from unshackle.core.constants import AnyTrack
from unshackle.core.credential import Credential
from unshackle.core.manifests import DASH
from unshackle.core.search_result import SearchResult
from unshackle.core.service import Service
from unshackle.core.titles import Episode, Movie, Movies, Series, Title_T, Titles_T
from unshackle.core.tracks import Subtitle, Tracks


class KNPY(Service):
    """
    Service code for Kanopy (kanopy.com).
    Version: 1.1.0

    Auth: Cookies (kapi_token) or Credential (username + password)
    Security: FHD@L3

    Handles both Movies and Series (Playlists).
    Detects and stops for movies that require tickets.
    Caching included
    """

    TITLE_RE = r"^https?://(?:www\.)?kanopy\.com/.+/(?P<id>\d+)$"
    GEOFENCE = ()
    NO_SUBTITLES = False

    @staticmethod
    @click.command(name="KNPY", short_help="https://kanopy.com")
    @click.argument("title", type=str)
    @click.pass_context
    def cli(ctx, **kwargs):
        return KNPY(ctx, **kwargs)

    def __init__(self, ctx, title: str):
        super().__init__(ctx)
        if not self.config:
            raise ValueError("KNPY configuration not found. Ensure config.yaml exists.")

        self.cdm = ctx.obj.cdm

        match = re.match(self.TITLE_RE, title)
        if match:
            self.content_id = match.group("id")
        else:
            self.content_id = None
            self.search_query = title

        self.API_VERSION = self.config["client"]["api_version"]
        self.USER_AGENT = self.config["client"]["user_agent"]
        self.WIDEVINE_UA = self.config["client"]["widevine_ua"]

        self.session.headers.update({
            "x-version": self.API_VERSION,
            "user-agent": self.USER_AGENT
        })

        self._jwt = None
        self._visitor_id = None
        self._user_id = None
        self._domain_id = None
        self.widevine_license_url = None

    def authenticate(self, cookies: Optional[CookieJar] = None, credential: Optional[Credential] = None) -> None:
        """
        Authenticate using either cookies or credentials.

        Cookie-based auth: Requires 'kapi_token' cookie from browser.
        Credential-based auth: Requires email and password.
        """

        if cookies:
            jwt_token = None
            cookie_visitor_id = None
            cookie_uid = None

            # Extract relevant cookies
            for cookie in cookies:
                if cookie.name == "kapi_token":
                    jwt_token = cookie.value
                elif cookie.name == "visitor_id":
                    cookie_visitor_id = cookie.value
                elif cookie.name == "uid":
                    cookie_uid = cookie.value

            if jwt_token:
                self.log.info("Attempting cookie-based authentication...")
                self._jwt = jwt_token
                self.session.headers.update({"authorization": f"Bearer {self._jwt}"})

                try:
                    # Decode JWT to extract user information
                    decoded_jwt = jwt.decode(self._jwt, options={"verify_signature": False})

                    # Check if token is expired
                    exp_timestamp = decoded_jwt.get("exp")
                    if exp_timestamp and exp_timestamp < datetime.now(timezone.utc).timestamp():
                        self.log.warning("Cookie token has expired.")
                        if credential:
                            self.log.info("Falling back to credential-based authentication...")
                        else:
                            raise ValueError("Cookie token expired and no credentials provided.")
                    else:
                        # Extract user data from JWT
                        jwt_data = decoded_jwt.get("data", {})
                        self._user_id = jwt_data.get("uid") or cookie_uid
                        self._visitor_id = jwt_data.get("visitor_id") or cookie_visitor_id

                        if not self._user_id:
                            raise ValueError("Could not extract user_id from cookie token")

                        self.log.info(f"Successfully authenticated via cookies (user_id: {self._user_id})")

                        # Fetch user library memberships to get domain_id
                        self._fetch_user_details()
                        return

                except jwt.DecodeError as e:
                    self.log.error(f"Failed to decode cookie token: {e}")
                    if credential:
                        self.log.info("Falling back to credential-based authentication...")
                    else:
                        raise ValueError(f"Invalid kapi_token cookie: {e}")
                except KeyError as e:
                    self.log.error(f"Missing expected field in cookie token: {e}")
                    if credential:
                        self.log.info("Falling back to credential-based authentication...")
                    else:
                        raise ValueError(f"Invalid kapi_token structure: {e}")
            else:
                self.log.info("No kapi_token found in cookies.")
                if not credential:
                    raise ValueError("No kapi_token cookie found and no credentials provided.")
                self.log.info("Falling back to credential-based authentication...")

        if not self._jwt:  # Only proceed if not already authenticated via cookies
            if not credential or not credential.username or not credential.password:
                raise ValueError("Kanopy requires either cookies (with kapi_token) or email/password for authentication.")

            # Check for cached credential-based token
            cache = self.cache.get("auth_token")

            if cache and not cache.expired:
                cached_data = cache.data
                valid_token = None

                if isinstance(cached_data, dict) and "token" in cached_data:
                    if cached_data.get("username") == credential.username:
                        valid_token = cached_data["token"]
                        self.log.info("Using cached authentication token")
                    else:
                        self.log.info(f"Cached token belongs to '{cached_data.get('username')}', but logging in as '{credential.username}'. Re-authenticating.")

                elif isinstance(cached_data, str):
                    self.log.info("Found legacy cached token format. Re-authenticating to ensure correct user.")

                if valid_token:
                    self._jwt = valid_token
                    self.session.headers.update({"authorization": f"Bearer {self._jwt}"})

                    if not self._user_id or not self._domain_id or not self._visitor_id:
                        try:
                            decoded_jwt = jwt.decode(self._jwt, options={"verify_signature": False})
                            self._user_id = decoded_jwt["data"]["uid"]
                            self._visitor_id = decoded_jwt["data"]["visitor_id"]
                            self.log.info(f"Extracted user_id and visitor_id from cached token.")
                            self._fetch_user_details()
                            return
                        except (KeyError, jwt.DecodeError) as e:
                            self.log.error(f"Could not decode cached token: {e}. Re-authenticating.")

            # Perform fresh login with credentials
            self.log.info("Performing handshake to get visitor token...")
            r = self.session.get(self.config["endpoints"]["handshake"])
            r.raise_for_status()
            handshake_data = r.json()
            self._visitor_id = handshake_data["visitorId"]
            initial_jwt = handshake_data["jwt"]

            self.log.info(f"Logging in as {credential.username}...")
            login_payload = {
                "credentialType": "email",
                "emailUser": {
                    "email": credential.username,
                    "password": credential.password
                }
            }
            r = self.session.post(
                self.config["endpoints"]["login"],
                json=login_payload,
                headers={"authorization": f"Bearer {initial_jwt}"}
            )
            r.raise_for_status()
            login_data = r.json()
            self._jwt = login_data["jwt"]
            self._user_id = login_data["userId"]

            self.session.headers.update({"authorization": f"Bearer {self._jwt}"})
            self.log.info(f"Successfully authenticated as {credential.username}")

            self._fetch_user_details()

            # Cache the token
            try:
                decoded_jwt = jwt.decode(self._jwt, options={"verify_signature": False})
                exp_timestamp = decoded_jwt.get("exp")

                cache_payload = {
                    "token": self._jwt,
                    "username": credential.username
                }

                if exp_timestamp:
                    expiration_in_seconds = int(exp_timestamp - datetime.now(timezone.utc).timestamp())
                    self.log.info(f"Caching token for {expiration_in_seconds / 60:.2f} minutes.")
                    cache.set(data=cache_payload, expiration=expiration_in_seconds)
                else:
                    self.log.warning("JWT has no 'exp' claim, caching for 1 hour as a fallback.")
                    cache.set(data=cache_payload, expiration=3600)
            except Exception as e:
                self.log.error(f"Failed to decode JWT for caching: {e}. Caching for 1 hour as a fallback.")
                cache.set(
                    data={"token": self._jwt, "username": credential.username},
                    expiration=3600
                )

    def _fetch_user_details(self):
        """Fetch user library memberships to determine the active domain_id."""
        self.log.info("Fetching user library memberships...")
        r = self.session.get(self.config["endpoints"]["memberships"].format(user_id=self._user_id))
        r.raise_for_status()
        memberships = r.json()

        # Look for the default active membership
        for membership in memberships.get("list", []):
            if membership.get("status") == "active" and membership.get("isDefault", False):
                self._domain_id = str(membership["domainId"])
                self.log.info(f"Using default library domain: {membership.get('sitename', 'Unknown')} (ID: {self._domain_id})")
                return

        # Fallback to first active membership
        for membership in memberships.get("list", []):
            if membership.get("status") == "active":
                self._domain_id = str(membership["domainId"])
                self.log.warning(f"No default library found. Using first active domain: {self._domain_id}")
                return

        if memberships.get("list"):
            self._domain_id = str(memberships["list"][0]["domainId"])
            self.log.warning(f"No active library found. Using first available domain: {self._domain_id}")
        else:
            raise ValueError("No library memberships found for this user.")

    def get_titles(self) -> Titles_T:
        if not self.content_id:
            raise ValueError("A content ID is required to get titles. Use a URL or run a search first.")
        if not self._domain_id:
            raise ValueError("Domain ID not set. Authentication may have failed.")

        r = self.session.get(self.config["endpoints"]["video_info"].format(video_id=self.content_id, domain_id=self._domain_id))
        r.raise_for_status()
        content_data = r.json()

        content_type = content_data.get("type")

        def parse_lang(data):
            try:
                langs = data.get("languages", [])
                if langs and isinstance(langs, list) and len(langs) > 0:
                    return Language.find(langs[0])
            except:
                pass
            return Language.get("en")

        if content_type == "video":
            video_data = content_data["video"]
            movie = Movie(
                id_=str(video_data["videoId"]),
                service=self.__class__,
                name=video_data["title"],
                year=video_data.get("productionYear"),
                description=video_data.get("descriptionHtml", ""),
                language=parse_lang(video_data),
                data=video_data,
            )
            return Movies([movie])

        elif content_type == "playlist":
            playlist_data = content_data["playlist"]
            series_title = playlist_data["title"]
            series_year = playlist_data.get("productionYear")

            season_match = re.search(r'(?:Season|S)\s*(\d+)', series_title, re.IGNORECASE)
            season_num = int(season_match.group(1)) if season_match else 1

            r = self.session.get(self.config["endpoints"]["video_items"].format(video_id=self.content_id, domain_id=self._domain_id))
            r.raise_for_status()
            items_data = r.json()

            episodes = []
            for i, item in enumerate(items_data.get("list", [])):
                if item.get("type") != "video":
                    continue

                video_data = item["video"]
                ep_num = i + 1

                ep_title = video_data.get("title", "")
                ep_match = re.search(r'Ep(?:isode)?\.?\s*(\d+)', ep_title, re.IGNORECASE)
                if ep_match:
                    ep_num = int(ep_match.group(1))

                episodes.append(
                    Episode(
                        id_=str(video_data["videoId"]),
                        service=self.__class__,
                        title=series_title,
                        season=season_num,
                        number=ep_num,
                        name=video_data["title"],
                        description=video_data.get("descriptionHtml", ""),
                        year=video_data.get("productionYear", series_year),
                        language=parse_lang(video_data),
                        data=video_data,
                    )
                )

            series = Series(episodes)
            series.name = series_title
            series.description = playlist_data.get("descriptionHtml", "")
            series.year = series_year
            return series

        else:
            raise ValueError(f"Unsupported content type: {content_type}")

    def get_tracks(self, title: Title_T) -> Tracks:
        play_payload = {
            "videoId": int(title.id),
            "domainId": int(self._domain_id),
            "userId": int(self._user_id),
            "visitorId": self._visitor_id
        }

        self.session.headers.setdefault("authorization", f"Bearer {self._jwt}")
        self.session.headers.setdefault("x-version", self.API_VERSION)
        self.session.headers.setdefault("user-agent", self.USER_AGENT)

        r = self.session.post(self.config["endpoints"]["plays"], json=play_payload)
        response_json = None
        try:
            response_json = r.json()
        except Exception:
            pass

        # Handle known errors gracefully
        if r.status_code == 403:
            if response_json and response_json.get("errorSubcode") == "playRegionRestricted":
                self.log.error("Kanopy reports: This video is not available in your country.")
                raise PermissionError(
                    "Playback blocked by region restriction. Try connecting through a supported country or verify your library's access region."
                )
            else:
                self.log.error(f"Access forbidden (HTTP 403). Response: {response_json}")
                raise PermissionError("Kanopy denied access to this video. It may require a different library membership or authentication.")

        r.raise_for_status()
        play_data = response_json or r.json()

        manifest_url = None
        manifest_type = None
        drm_info = {}

        # Iterate through manifests: prefer DASH, fallback to HLS
        for manifest in play_data.get("manifests", []):
            manifest_type_raw = manifest["manifestType"]
            url = manifest["url"].strip()  # Strip whitespace from URLs

            # Construct full URL if relative
            if url.startswith("/"):
                url = f"https://kanopy.com{url}"

            drm_type = manifest.get("drmType")

            if manifest_type_raw == "dash":
                manifest_url = url
                manifest_type = "dash"

                if drm_type == "kanopyDrm":
                    play_id = play_data.get("playId")
                    self.widevine_license_url = self.config["endpoints"]["widevine_license"].format(
                        license_id=f"{play_id}-0"
                    )
                elif drm_type == "studioDrm":
                    license_id = manifest.get("drmLicenseID", f"{play_data.get('playId')}-1")
                    self.widevine_license_url = self.config["endpoints"]["widevine_license"].format(
                        license_id=license_id
                    )
                else:
                    self.log.warning(f"Unknown DASH drmType: {drm_type}")
                    self.widevine_license_url = None
                break  # Prefer DASH, exit loop

            elif manifest_type_raw == "hls" and not manifest_url:
                # Store HLS as fallback if DASH not found
                manifest_url = url
                manifest_type = "hls"

                if drm_type == "fairplay":
                    self.log.warning("HLS with FairPlay DRM detected - not currently supported by this service")
                    self.widevine_license_url = None
                    drm_info["fairplay"] = True
                else:
                    # HLS with no DRM or unsupported DRM type
                    self.widevine_license_url = None
                    drm_info["clear"] = True

        if not manifest_url:
            raise ValueError("Could not find a DASH or HLS manifest for this title.")
        if manifest_type == "dash" and not self.widevine_license_url:
            raise ValueError("Could not construct Widevine license URL for DASH manifest.")

        self.log.info(f"Fetching {manifest_type.upper()} manifest from: {manifest_url}")
        r = self.session.get(manifest_url)
        r.raise_for_status()

        # Refresh headers for manifest parsing
        self.session.headers.clear()
        self.session.headers.update({
            "User-Agent": self.WIDEVINE_UA,
            "Accept": "*/*",
            "Accept-Encoding": "gzip, deflate",
            "Connection": "keep-alive",
        })

        # Parse manifest based on type
        if manifest_type == "dash":
            tracks = DASH.from_text(r.text, url=manifest_url).to_tracks(language=title.language)
        elif manifest_type == "hls":
            # Try to import HLS parser from unshackle
            try:
                from unshackle.core.manifests import HLS
                tracks = HLS.from_text(r.text, url=manifest_url).to_tracks(language=title.language)
                self.log.info("Successfully parsed HLS manifest")
            except ImportError:
                self.log.error(
                    "HLS manifest parser not available in unshackle.core.manifests. "
                    "Ensure your unshackle installation supports HLS parsing."
                )
                raise
            except Exception as e:
                self.log.error(f"Failed to parse HLS manifest: {e}")
                raise
        else:
            raise ValueError(f"Unsupported manifest type: {manifest_type}")

        # Add subtitles/captions from play_data (works for both DASH and HLS)
        for caption_data in play_data.get("captions", []):
            lang = caption_data.get("language", "en")
            for file_info in caption_data.get("files", []):
                if file_info.get("type") == "webvtt":
                    tracks.add(Subtitle(
                        id_=f"caption-{lang}",
                        url=file_info["url"].strip(),
                        codec=Subtitle.Codec.WebVTT,
                        language=Language.get(lang)
                    ))
                    break

        return tracks


    def get_widevine_license(self, *, challenge: bytes, title: Title_T, track: AnyTrack) -> bytes:
        if not self.widevine_license_url:
            raise ValueError("Widevine license URL was not set. Call get_tracks first.")

        license_headers = {
            "Content-Type": "application/octet-stream",
            "User-Agent": self.WIDEVINE_UA,
            "Authorization": f"Bearer {self._jwt}",
            "X-Version": self.API_VERSION
        }

        r = self.session.post(
            self.widevine_license_url,
            data=challenge,
            headers=license_headers
        )
        r.raise_for_status()
        return r.content

    def search(self) -> Generator[SearchResult, None, None]:
        if not hasattr(self, 'search_query') or not self.search_query:
            self.log.error("Search query not set. Cannot search.")
            return

        self.log.info(f"Searching for '{self.search_query}'...")

        # Ensure we have a domain ID (Library ID) before searching
        if not self._domain_id:
            self._fetch_user_details()

        params = {
            "query": self.search_query,
            "sort": "relevance",
            "domainId": self._domain_id,
            "isKids": "false",
            "page": 0,
            "perPage": 40
        }

        r = self.session.get(self.config["endpoints"]["search"], params=params)
        r.raise_for_status()
        search_data = r.json()

        # The API returns results in a "list" key
        results_list = search_data.get("list", [])

        if not results_list:
            self.log.warning(f"No results found for '{self.search_query}'")
            return

        for item in results_list:
            # Kanopy search results use 'videoId' as the unique identifier
            video_id = item.get("videoId")
            if not video_id:
                continue

            title = item.get("title", "Unknown Title")

            yield SearchResult(
                id_=str(video_id),
                title=title,
                label="VIDEO/SERIES",
                url=f"https://www.kanopy.com/video/{video_id}"
            )

    def get_chapters(self, title: Title_T) -> list:
        return []