736 lines
30 KiB
Python
Raw Normal View History

2023-02-06 02:33:09 +00:00
from __future__ import annotations
import base64
import html
import logging
2023-02-06 02:33:09 +00:00
import math
import re
import sys
2023-02-06 02:33:09 +00:00
from copy import copy
from functools import partial
from pathlib import Path
from typing import Any, Callable, Optional, Union
2023-02-06 02:33:09 +00:00
from urllib.parse import urljoin, urlparse
from uuid import UUID
from zlib import crc32
2023-02-06 02:33:09 +00:00
import requests
from langcodes import Language, tag_is_valid
from lxml.etree import Element
2023-02-06 02:33:09 +00:00
from pywidevine.cdm import Cdm as WidevineCdm
from pywidevine.pssh import PSSH
from requests import Session
from devine.core.constants import DOWNLOAD_CANCELLED, DOWNLOAD_LICENCE_ONLY, AnyTrack
2023-09-19 12:23:52 +01:00
from devine.core.downloaders import downloader
from devine.core.downloaders import requests as requests_downloader
2023-02-06 02:33:09 +00:00
from devine.core.drm import Widevine
from devine.core.tracks import Audio, Subtitle, Tracks, Video
from devine.core.utilities import is_close_match, try_ensure_utf8
2023-02-06 02:33:09 +00:00
from devine.core.utils.xml import load_xml
class DASH:
def __init__(self, manifest, url: str):
if manifest is None:
raise ValueError("DASH manifest must be provided.")
if manifest.tag != "MPD":
raise TypeError(f"Expected 'MPD' document, but received a '{manifest.tag}' document instead.")
if not url:
raise requests.URLRequired("DASH manifest URL must be provided for relative path computations.")
if not isinstance(url, str):
raise TypeError(f"Expected url to be a {str}, not {url!r}")
self.manifest = manifest
self.url = url
@classmethod
def from_url(cls, url: str, session: Optional[Session] = None, **args: Any) -> DASH:
if not url:
raise requests.URLRequired("DASH manifest URL must be provided for relative path computations.")
if not isinstance(url, str):
raise TypeError(f"Expected url to be a {str}, not {url!r}")
if not session:
session = Session()
elif not isinstance(session, Session):
raise TypeError(f"Expected session to be a {Session}, not {session!r}")
res = session.get(url, **args)
if res.url != url:
url = res.url
2023-02-06 02:33:09 +00:00
if not res.ok:
raise requests.ConnectionError(
"Failed to request the MPD document.",
response=res
)
return DASH.from_text(res.text, url)
@classmethod
def from_text(cls, text: str, url: str) -> DASH:
if not text:
raise ValueError("DASH manifest Text must be provided.")
if not isinstance(text, str):
raise TypeError(f"Expected text to be a {str}, not {text!r}")
if not url:
raise requests.URLRequired("DASH manifest URL must be provided for relative path computations.")
if not isinstance(url, str):
raise TypeError(f"Expected url to be a {str}, not {url!r}")
manifest = load_xml(text)
return cls(manifest, url)
def to_tracks(
self,
language: Optional[Union[str, Language]] = None,
period_filter: Optional[Callable] = None
) -> Tracks:
2023-02-06 02:33:09 +00:00
"""
Convert an MPEG-DASH document to Video, Audio and Subtitle Track objects.
2023-02-06 02:33:09 +00:00
Parameters:
language: The Title's Original Recorded Language. It will also be used as a fallback
track language value if the manifest does not list language information.
2023-02-06 02:33:09 +00:00
period_filter: Filter out period's within the manifest.
All Track URLs will be a list of segment URLs.
"""
tracks = Tracks()
for period in self.manifest.findall("Period"):
if callable(period_filter) and period_filter(period):
continue
for adaptation_set in period.findall("AdaptationSet"):
if self.is_trick_mode(adaptation_set):
# we don't want trick mode streams (they are only used for fast-forward/rewind)
continue
2023-02-06 02:33:09 +00:00
for rep in adaptation_set.findall("Representation"):
get = partial(self._get, adaptation_set=adaptation_set, representation=rep)
findall = partial(self._findall, adaptation_set=adaptation_set, representation=rep, both=True)
2023-02-06 02:33:09 +00:00
codecs = get("codecs")
content_type = get("contentType")
mime_type = get("mimeType")
2023-02-06 02:33:09 +00:00
if not content_type and mime_type:
content_type = mime_type.split("/")[0]
if not content_type and not mime_type:
raise ValueError("Unable to determine the format of a Representation, cannot continue...")
if mime_type == "application/mp4" or content_type == "application":
# likely mp4-boxed subtitles
# TODO: It may not actually be subtitles
2023-02-06 02:33:09 +00:00
try:
real_codec = Subtitle.Codec.from_mime(codecs)
2023-02-06 02:33:09 +00:00
content_type = "text"
mime_type = f"application/mp4; codecs='{real_codec.value.lower()}'"
2023-02-06 02:33:09 +00:00
except ValueError:
raise ValueError(f"Unsupported content type '{content_type}' with codecs of '{codecs}'")
if content_type == "text" and mime_type and "/mp4" not in mime_type:
# mimeType likely specifies the subtitle codec better than `codecs`
codecs = mime_type.split("/")[1]
if content_type == "video":
track_type = Video
track_codec = Video.Codec.from_codecs(codecs)
track_args = dict(
range_=self.get_video_range(
codecs,
findall("SupplementalProperty"),
findall("EssentialProperty")
),
bitrate=get("bandwidth") or None,
width=get("width") or 0,
height=get("height") or 0,
fps=get("frameRate") or (rep.find("SegmentBase") or {}).get("timescale") or None
)
elif content_type == "audio":
track_type = Audio
track_codec = Audio.Codec.from_codecs(codecs)
track_args = dict(
bitrate=get("bandwidth") or None,
channels=next(iter(
rep.xpath("AudioChannelConfiguration/@value")
or adaptation_set.xpath("AudioChannelConfiguration/@value")
), None),
joc=self.get_ddp_complexity_index(adaptation_set, rep),
descriptive=self.is_descriptive(adaptation_set)
)
elif content_type == "text":
track_type = Subtitle
track_codec = Subtitle.Codec.from_codecs(codecs or "vtt")
track_args = dict(
cc=self.is_closed_caption(adaptation_set),
sdh=self.is_sdh(adaptation_set),
forced=self.is_forced(adaptation_set)
)
elif content_type == "image":
# we don't want what's likely thumbnails for the seekbar
continue
else:
raise ValueError(f"Unknown Track Type '{content_type}'")
track_lang = self.get_language(adaptation_set, rep, fallback=language)
2023-02-06 02:33:09 +00:00
if not track_lang:
msg = "Language information could not be derived from a Representation."
if language is None:
msg += " No fallback language was provided when calling DASH.to_tracks()."
elif not tag_is_valid((str(language) or "").strip()) or str(language).startswith("und"):
msg += f" The fallback language provided is also invalid: {language}"
raise ValueError(msg)
2023-02-06 02:33:09 +00:00
# for some reason it's incredibly common for services to not provide
# a good and actually unique track ID, sometimes because of the lang
# dialect not being represented in the id, or the bitrate, or such.
# this combines all of them as one and hashes it to keep it small(ish).
track_id = hex(crc32("{codec}-{lang}-{bitrate}-{base_url}-{ids}-{track_args}".format(
2023-02-06 02:33:09 +00:00
codec=codecs,
lang=track_lang,
bitrate=get("bitrate"),
2023-02-06 02:33:09 +00:00
base_url=(rep.findtext("BaseURL") or "").split("?")[0],
ids=[get("audioTrackId"), get("id"), period.get("id")],
track_args=track_args
).encode()))[2:]
2023-02-06 02:33:09 +00:00
tracks.add(track_type(
id_=track_id,
url=(self.url, self.manifest, rep, adaptation_set, period),
2023-02-06 02:33:09 +00:00
codec=track_codec,
language=track_lang,
is_original_lang=language and is_close_match(track_lang, [language]),
2023-02-06 02:33:09 +00:00
descriptor=Video.Descriptor.MPD,
extra=(rep, adaptation_set),
**track_args
2023-02-06 02:33:09 +00:00
))
# only get tracks from the first main-content period
break
return tracks
@staticmethod
def download_track(
track: AnyTrack,
save_path: Path,
save_dir: Path,
progress: partial,
session: Optional[Session] = None,
proxy: Optional[str] = None,
license_widevine: Optional[Callable] = None
):
if not session:
session = Session()
elif not isinstance(session, Session):
raise TypeError(f"Expected session to be a {Session}, not {session!r}")
if proxy:
session.proxies.update({
"all": proxy
})
log = logging.getLogger("DASH")
manifest_url, manifest, representation, adaptation_set, period = track.url
track.drm = DASH.get_drm(
representation.findall("ContentProtection") +
adaptation_set.findall("ContentProtection")
)
manifest_url_query = urlparse(manifest_url).query
manifest_base_url = manifest.findtext("BaseURL")
if not manifest_base_url:
manifest_base_url = manifest_url
elif not re.match("^https?://", manifest_base_url, re.IGNORECASE):
manifest_base_url = urljoin(manifest_url, f"./{manifest_base_url}")
period_base_url = urljoin(manifest_base_url, period.findtext("BaseURL"))
rep_base_url = urljoin(period_base_url, representation.findtext("BaseURL"))
period_duration = period.get("duration") or manifest.get("mediaPresentationDuration")
init_data: Optional[bytes] = None
segment_template = representation.find("SegmentTemplate")
if segment_template is None:
segment_template = adaptation_set.find("SegmentTemplate")
segment_list = representation.find("SegmentList")
if segment_list is None:
segment_list = adaptation_set.find("SegmentList")
segment_base = representation.find("SegmentBase")
if segment_base is None:
segment_base = adaptation_set.find("SegmentBase")
segments: list[tuple[str, Optional[str]]] = []
track_kid: Optional[UUID] = None
if segment_template is not None:
segment_template = copy(segment_template)
start_number = int(segment_template.get("startNumber") or 1)
segment_timeline = segment_template.find("SegmentTimeline")
for item in ("initialization", "media"):
value = segment_template.get(item)
if not value:
continue
if not re.match("^https?://", value, re.IGNORECASE):
if not rep_base_url:
raise ValueError("Resolved Segment URL is not absolute, and no Base URL is available.")
value = urljoin(rep_base_url, value)
if not urlparse(value).query and manifest_url_query:
value += f"?{manifest_url_query}"
segment_template.set(item, value)
init_url = segment_template.get("initialization")
if init_url:
res = session.get(DASH.replace_fields(
init_url,
Bandwidth=representation.get("bandwidth"),
RepresentationID=representation.get("id")
))
res.raise_for_status()
init_data = res.content
track_kid = track.get_key_id(init_data)
if segment_timeline is not None:
seg_time_list = []
current_time = 0
for s in segment_timeline.findall("S"):
if s.get("t"):
current_time = int(s.get("t"))
for _ in range(1 + (int(s.get("r") or 0))):
seg_time_list.append(current_time)
current_time += int(s.get("d"))
seg_num_list = list(range(start_number, len(seg_time_list) + start_number))
for t, n in zip(seg_time_list, seg_num_list):
segments.append((
DASH.replace_fields(
segment_template.get("media"),
Bandwidth=representation.get("bandwidth"),
Number=n,
RepresentationID=representation.get("id"),
Time=t
), None
))
else:
if not period_duration:
raise ValueError("Duration of the Period was unable to be determined.")
period_duration = DASH.pt_to_sec(period_duration)
segment_duration = float(segment_template.get("duration"))
segment_timescale = float(segment_template.get("timescale") or 1)
total_segments = math.ceil(period_duration / (segment_duration / segment_timescale))
for s in range(start_number, start_number + total_segments):
segments.append((
DASH.replace_fields(
segment_template.get("media"),
Bandwidth=representation.get("bandwidth"),
Number=s,
RepresentationID=representation.get("id"),
Time=s
), None
))
elif segment_list is not None:
init_data = None
initialization = segment_list.find("Initialization")
if initialization is not None:
source_url = initialization.get("sourceURL")
if not source_url:
source_url = rep_base_url
elif not re.match("^https?://", source_url, re.IGNORECASE):
source_url = urljoin(rep_base_url, f"./{source_url}")
if initialization.get("range"):
init_range_header = {"Range": f"bytes={initialization.get('range')}"}
else:
init_range_header = None
res = session.get(url=source_url, headers=init_range_header)
res.raise_for_status()
init_data = res.content
track_kid = track.get_key_id(init_data)
segment_urls = segment_list.findall("SegmentURL")
for segment_url in segment_urls:
media_url = segment_url.get("media")
if not media_url:
media_url = rep_base_url
elif not re.match("^https?://", media_url, re.IGNORECASE):
media_url = urljoin(rep_base_url, f"./{media_url}")
segments.append((
media_url,
segment_url.get("mediaRange")
))
elif segment_base is not None:
media_range = None
init_data = None
initialization = segment_base.find("Initialization")
if initialization is not None:
if initialization.get("range"):
init_range_header = {"Range": f"bytes={initialization.get('range')}"}
else:
init_range_header = None
res = session.get(url=rep_base_url, headers=init_range_header)
res.raise_for_status()
init_data = res.content
track_kid = track.get_key_id(init_data)
total_size = res.headers.get("Content-Range", "").split("/")[-1]
if total_size:
media_range = f"{len(init_data)}-{total_size}"
segments.append((
rep_base_url,
media_range
))
elif rep_base_url:
segments.append((
rep_base_url,
None
))
else:
log.error("Could not find a way to get segments from this MPD manifest.")
log.debug(manifest_url)
sys.exit(1)
if not track.drm and isinstance(track, (Video, Audio)):
try:
track.drm = [Widevine.from_init_data(init_data)]
except Widevine.Exceptions.PSSHNotFound:
# it might not have Widevine DRM, or might not have found the PSSH
log.warning("No Widevine PSSH was found for this track, is it DRM free?")
if track.drm:
# last chance to find the KID, assumes first segment will hold the init data
track_kid = track_kid or track.get_key_id(url=segments[0][0], session=session)
# TODO: What if we don't want to use the first DRM system?
drm = track.drm[0]
if isinstance(drm, Widevine):
# license and grab content keys
try:
if not license_widevine:
raise ValueError("license_widevine func must be supplied to use Widevine DRM")
progress(downloaded="LICENSING")
license_widevine(drm, track_kid=track_kid)
progress(downloaded="[yellow]LICENSED")
except Exception: # noqa
DOWNLOAD_CANCELLED.set() # skip pending track downloads
progress(downloaded="[red]FAILED")
raise
else:
drm = None
if DOWNLOAD_LICENCE_ONLY.is_set():
progress(downloaded="[yellow]SKIPPED")
return
progress(total=len(segments))
downloader_ = downloader
if downloader.__name__ == "aria2c" and any(bytes_range is not None for url, bytes_range in segments):
# aria2(c) is shit and doesn't support the Range header, fallback to the requests downloader
downloader_ = requests_downloader
for status_update in downloader_(
urls=[
{
"url": url,
"headers": {
"Range": f"bytes={bytes_range}"
} if bytes_range else {}
}
for url, bytes_range in segments
],
output_dir=save_dir,
filename="{i:0%d}.mp4" % (len(str(len(segments)))),
headers=session.headers,
cookies=session.cookies,
proxy=proxy,
max_workers=16
):
file_downloaded = status_update.get("file_downloaded")
if file_downloaded and callable(track.OnSegmentDownloaded):
track.OnSegmentDownloaded(file_downloaded)
else:
downloaded = status_update.get("downloaded")
if downloaded and downloaded.endswith("/s"):
status_update["downloaded"] = f"DASH {downloaded}"
progress(**status_update)
segments_to_merge = sorted(save_dir.iterdir())
progress(downloaded="Merging", completed=0, total=len(segments_to_merge))
with open(save_path, "wb") as f:
if init_data:
f.write(init_data)
for segment_file in segments_to_merge:
segment_data = segment_file.read_bytes()
# TODO: fix encoding after decryption?
if (
not drm and isinstance(track, Subtitle) and
track.codec not in (Subtitle.Codec.fVTT, Subtitle.Codec.fTTML)
):
segment_data = try_ensure_utf8(segment_data)
segment_data = segment_data.decode("utf8"). \
replace("‎", html.unescape("‎")). \
replace("‏", html.unescape("‏")). \
encode("utf8")
f.write(segment_data)
f.flush()
segment_file.unlink()
progress(advance=1)
track.path = save_path
if callable(track.OnDownloaded):
track.OnDownloaded()
if drm:
progress(downloaded="Decrypting", completed=0, total=100)
drm.decrypt(save_path)
track.drm = None
if callable(track.OnDecrypted):
track.OnDecrypted(drm)
progress(downloaded="Decrypting", advance=100)
save_dir.rmdir()
progress(downloaded="Downloaded")
2023-02-06 02:33:09 +00:00
@staticmethod
def _get(
item: str,
adaptation_set: Element,
representation: Optional[Element] = None
) -> Optional[Any]:
"""Helper to get a requested item from the Representation, otherwise from the AdaptationSet."""
adaptation_set_item = adaptation_set.get(item)
if representation is None:
return adaptation_set_item
representation_item = representation.get(item)
if representation_item is not None:
return representation_item
return adaptation_set_item
@staticmethod
def _findall(
item: str,
adaptation_set: Element,
representation: Optional[Element] = None,
both: bool = False
) -> list[Any]:
"""
Helper to get all requested items from the Representation, otherwise from the AdaptationSet.
Optionally, you may pass both=True to keep both values (where available).
"""
adaptation_set_items = adaptation_set.findall(item)
if representation is None:
return adaptation_set_items
representation_items = representation.findall(item)
if both:
return representation_items + adaptation_set_items
if representation_items:
return representation_items
return adaptation_set_items
@staticmethod
def get_language(
adaptation_set: Element,
representation: Optional[Element] = None,
fallback: Optional[Union[str, Language]] = None
) -> Optional[Language]:
"""
Get Language (if any) from the AdaptationSet or Representation.
A fallback language may be provided if no language information could be
retrieved.
"""
options = []
if representation is not None:
options.append(representation.get("lang"))
# derive language from somewhat common id string format
# the format is typically "{rep_id}_{lang}={bitrate}" or similar
rep_id = representation.get("id")
if rep_id:
m = re.match(r"\w+_(\w+)=\d+", rep_id)
if m:
options.append(m.group(1))
options.append(adaptation_set.get("lang"))
if fallback:
options.append(fallback)
2023-02-06 02:33:09 +00:00
for option in options:
option = (str(option) or "").strip()
if not tag_is_valid(option) or option.startswith("und"):
continue
return Language.get(option)
@staticmethod
def get_video_range(
codecs: str,
all_supplemental_props: list[Element],
all_essential_props: list[Element]
) -> Video.Range:
if codecs.startswith(("dva1", "dvav", "dvhe", "dvh1")):
return Video.Range.DV
return Video.Range.from_cicp(
primaries=next((
int(x.get("value"))
for x in all_supplemental_props + all_essential_props
if x.get("schemeIdUri") == "urn:mpeg:mpegB:cicp:ColourPrimaries"
), 0),
transfer=next((
int(x.get("value"))
for x in all_supplemental_props + all_essential_props
if x.get("schemeIdUri") == "urn:mpeg:mpegB:cicp:TransferCharacteristics"
), 0),
matrix=next((
int(x.get("value"))
for x in all_supplemental_props + all_essential_props
if x.get("schemeIdUri") == "urn:mpeg:mpegB:cicp:MatrixCoefficients"
), 0)
)
@staticmethod
def is_trick_mode(adaptation_set: Element) -> bool:
"""Check if contents of Adaptation Set is a Trick-Mode stream."""
essential_props = adaptation_set.findall("EssentialProperty")
supplemental_props = adaptation_set.findall("SupplementalProperty")
return any(
prop.get("schemeIdUri") == "http://dashif.org/guidelines/trickmode"
for prop in essential_props + supplemental_props
)
@staticmethod
def is_descriptive(adaptation_set: Element) -> bool:
"""Check if contents of Adaptation Set is Descriptive."""
return any(
(x.get("schemeIdUri"), x.get("value")) in (
("urn:mpeg:dash:role:2011", "descriptive"),
("urn:tva:metadata:cs:AudioPurposeCS:2007", "1")
)
for x in adaptation_set.findall("Accessibility")
)
@staticmethod
def is_forced(adaptation_set: Element) -> bool:
"""Check if contents of Adaptation Set is a Forced Subtitle."""
return any(
x.get("schemeIdUri") == "urn:mpeg:dash:role:2011"
and x.get("value") in ("forced-subtitle", "forced_subtitle")
for x in adaptation_set.findall("Role")
)
@staticmethod
def is_sdh(adaptation_set: Element) -> bool:
"""Check if contents of Adaptation Set is for the Hearing Impaired."""
return any(
(x.get("schemeIdUri"), x.get("value")) == ("urn:tva:metadata:cs:AudioPurposeCS:2007", "2")
for x in adaptation_set.findall("Accessibility")
)
@staticmethod
def is_closed_caption(adaptation_set: Element) -> bool:
"""Check if contents of Adaptation Set is a Closed Caption Subtitle."""
return any(
(x.get("schemeIdUri"), x.get("value")) == ("urn:mpeg:dash:role:2011", "caption")
for x in adaptation_set.findall("Role")
)
@staticmethod
def get_ddp_complexity_index(adaptation_set: Element, representation: Optional[Element]) -> Optional[int]:
"""Get the DD+ Complexity Index (if any) from the AdaptationSet or Representation."""
return next((
int(x.get("value"))
for x in DASH._findall("SupplementalProperty", adaptation_set, representation, both=True)
if x.get("schemeIdUri") == "tag:dolby.com,2018:dash:EC3_ExtensionComplexityIndex:2018"
), None)
@staticmethod
def get_drm(protections: list[Element]) -> list[Widevine]:
2023-02-06 02:33:09 +00:00
drm = []
2023-02-06 02:33:09 +00:00
for protection in protections:
# TODO: Add checks for PlayReady, FairPlay, maybe more
urn = (protection.get("schemeIdUri") or "").lower()
if urn != WidevineCdm.urn:
continue
pssh = protection.findtext("pssh")
if not pssh:
continue
pssh = PSSH(pssh)
kid = protection.get("kid")
if kid:
kid = UUID(bytes=base64.b64decode(kid))
default_kid = protection.get("default_KID")
if default_kid:
kid = UUID(default_kid)
if not pssh.key_ids and not kid:
# weird manifest, look across all protections for a default_KID
kid = next((
UUID(protection.get("default_KID"))
for protection in protections
if protection.get("default_KID")
), None)
drm.append(Widevine(
pssh=pssh,
kid=kid
))
return drm
@staticmethod
def pt_to_sec(d: Union[str, float]) -> float:
if isinstance(d, float):
return d
has_ymd = d[0:8] == "P0Y0M0DT"
if d[0:2] != "PT" and not has_ymd:
raise ValueError("Input data is not a valid time string.")
if has_ymd:
d = d[6:].upper() # skip `P0Y0M0DT`
else:
d = d[2:].upper() # skip `PT`
m = re.findall(r"([\d.]+.)", d)
return sum(
float(x[0:-1]) * {"H": 60 * 60, "M": 60, "S": 1}[x[-1].upper()]
for x in m
)
@staticmethod
def replace_fields(url: str, **kwargs: Any) -> str:
for field, value in kwargs.items():
url = url.replace(f"${field}$", str(value))
m = re.search(fr"\${re.escape(field)}%([a-z0-9]+)\$", url, flags=re.I)
if m:
url = url.replace(m.group(), f"{value:{m.group(1)}}")
return url
2024-01-09 02:31:02 +00:00
__all__ = ("DASH",)