from typing import List

import webvtt

from schemas.transcript import CaptionSegment


def parse_vtt(path: str) -> List[CaptionSegment]:
    """Parse a WebVTT file into de-duplicated caption segments.

    Each caption yielded by ``webvtt.read(path)`` becomes a
    ``CaptionSegment`` whose ``start``/``end`` are seconds (float) and whose
    ``text`` is the caption text with surrounding whitespace stripped. The
    resulting list is then passed through :func:`dedupe_segments`.

    Args:
        path: Location of the ``.vtt`` file, handed unchanged to
            ``webvtt.read``.

    Returns:
        The cleaned list of caption segments, in file order.

    Raises:
        ValueError: If a caption timestamp cannot be parsed by
            :func:`_to_seconds`.
    """
    segments = [
        CaptionSegment(
            start=_to_seconds(caption.start),
            end=_to_seconds(caption.end),
            text=caption.text.strip(),
        )
        for caption in webvtt.read(path)
    ]
    return dedupe_segments(segments)


def _to_seconds(ts: str) -> float:
    """Convert a WebVTT timestamp string to seconds.

    Accepts ``HH:MM:SS.fff`` as well as the hour-less ``MM:SS.fff`` form that
    WebVTT also permits. The fractional part may have any number of digits;
    it is scaled by its own length, so ``"1.5"`` means 1.5 s, not 1.005 s.

    Args:
        ts: Timestamp text such as ``"00:01:02.500"`` or ``"01:02.500"``.

    Returns:
        The timestamp expressed in seconds.

    Raises:
        ValueError: If ``ts`` does not have two or three ``:``-separated
            fields, lacks a fractional part, or contains non-numeric fields.
    """
    # Strip first so trailing whitespace cannot inflate the digit count below.
    fields = ts.strip().split(":")
    if len(fields) == 2:
        # Hours omitted: treat as zero hours.
        fields.insert(0, "0")
    if len(fields) != 3:
        raise ValueError(f"invalid timestamp: {ts!r}")

    hours, minutes, rest = fields
    seconds, dot, fraction = rest.partition(".")
    if not dot or not fraction:
        raise ValueError(f"invalid timestamp (missing fraction): {ts!r}")

    # Sum the integer fields first and add the fraction last; for three-digit
    # fractions this is the same arithmetic as dividing milliseconds by 1000.
    whole = int(hours) * 3600 + int(minutes) * 60 + int(seconds)
    return whole + int(fraction) / 10 ** len(fraction)


def dedupe_segments(segments: List[CaptionSegment]) -> List[CaptionSegment]:
    """Drop blank segments and collapse captions repeated by their successor.

    Walks ``segments`` in order. A segment whose stripped text is empty is
    skipped. If the stripped text of the most recently kept segment occurs
    anywhere inside the current segment's stripped text, the kept segment is
    replaced by the current one; otherwise the current segment is appended.

    Args:
        segments: Caption segments in playback order. The input list is not
            modified.

    Returns:
        A new list holding the surviving segment objects (not copies).
    """
    cleaned: List[CaptionSegment] = []
    for seg in segments:
        text = seg.text.strip()
        if not text:
            continue

        prev_text = cleaned[-1].text.strip() if cleaned else ""
        if prev_text and prev_text in text:
            # NOTE(review): the replacement keeps only the later segment, so
            # the earlier segment's start time is discarded -- confirm that
            # downstream consumers do not need the merged time span.
            # NOTE(review): this is a plain substring test, so a short
            # previous caption (e.g. "I") is dropped whenever it happens to
            # appear inside the next one -- confirm this is intended.
            cleaned[-1] = seg
            continue

        cleaned.append(seg)
    return cleaned