youtube-transcript-api/services/captions.py

52 lines
1.0 KiB
Python

import webvtt
from schemas.transcript import CaptionSegment
from typing import List
def parse_vtt(path: str):
    """Load a WebVTT file and return its captions as de-duplicated segments.

    Args:
        path: Location of the caption file; passed unchanged to
            ``webvtt.read``.

    Returns:
        The result of ``dedupe_segments`` applied to one ``CaptionSegment``
        per caption, where ``start``/``end`` have been converted to seconds
        and ``text`` has surrounding whitespace removed.
    """
    parsed = [
        CaptionSegment(
            start=_to_seconds(cue.start),
            end=_to_seconds(cue.end),
            text=cue.text.strip(),
        )
        for cue in webvtt.read(path)
    ]
    return dedupe_segments(parsed)
def _to_seconds(ts: str) -> float:
    """Convert a caption timestamp string to a number of seconds.

    Accepts ``HH:MM:SS.fff`` as well as the short ``MM:SS.fff`` form in which
    the hours field is omitted. The fractional part is optional and may have
    any number of digits; it is scaled by its own length rather than assumed
    to be milliseconds.

    Args:
        ts: Timestamp text, e.g. ``"00:01:02.500"``.

    Returns:
        The timestamp expressed in seconds.

    Raises:
        ValueError: If ``ts`` does not have two or three colon-separated
            fields, or a field is not an integer.
    """
    fields = ts.strip().split(":")
    if len(fields) == 3:
        hours, minutes, rest = fields
    elif len(fields) == 2:
        # Short form: no hours field.
        hours = "0"
        minutes, rest = fields
    else:
        raise ValueError(f"invalid timestamp: {ts!r}")

    seconds, _, fraction = rest.partition(".")
    whole = int(hours) * 3600 + int(minutes) * 60 + int(seconds)
    # Scale by the digit count so ".5" means 0.5 s, not 0.005 s; for the
    # usual three-digit fraction this is exactly int(fraction) / 1000.
    frac = int(fraction) / 10 ** len(fraction) if fraction else 0.0
    return whole + frac
def dedupe_segments(segments: List[CaptionSegment]) -> List[CaptionSegment]:
    """Drop blank segments and collapse progressively repeated captions.

    A segment whose text is empty after stripping is skipped. When the
    stripped text of the most recently kept segment occurs anywhere inside
    the current segment's stripped text, the current segment takes its place
    instead of being appended.

    Args:
        segments: Caption segments, processed in the order given.

    Returns:
        A new list holding the kept segments; ``segments`` itself is not
        modified.
    """
    kept = []
    for current in segments:
        current_text = current.text.strip()
        if not current_text:
            continue

        # Empty string when nothing has been kept yet, so the first
        # non-blank segment is always appended.
        last_text = kept[-1].text.strip() if kept else ""
        if last_text and last_text in current_text:
            # NOTE(review): the superseded segment is discarded whole, so
            # its earlier start time is lost — confirm this is intended.
            kept[-1] = current
        else:
            kept.append(current)
    return kept