52 lines
1.0 KiB
Python
52 lines
1.0 KiB
Python
import webvtt
|
|
from schemas.transcript import CaptionSegment
|
|
from typing import List
|
|
|
|
|
|
def parse_vtt(path: str):
    """Load the WebVTT file at ``path`` and return its de-duplicated segments.

    Each caption yielded by ``webvtt.read`` becomes a ``CaptionSegment`` whose
    ``start``/``end`` are the caption timestamps converted to seconds and
    whose ``text`` is the caption text with surrounding whitespace removed.
    The resulting list is passed through ``dedupe_segments`` before being
    returned.
    """
    parsed = [
        CaptionSegment(
            start=_to_seconds(cue.start),
            end=_to_seconds(cue.end),
            text=cue.text.strip(),
        )
        for cue in webvtt.read(path)
    ]
    return dedupe_segments(parsed)
|
|
|
|
|
|
def _to_seconds(ts: str) -> float:
|
|
h, m, rest = ts.split(":")
|
|
s, ms = rest.split(".")
|
|
return (
|
|
int(h) * 3600
|
|
+ int(m) * 60
|
|
+ int(s)
|
|
+ int(ms) / 1000
|
|
)
|
|
|
|
def dedupe_segments(segments: List[CaptionSegment]) -> List[CaptionSegment]:
    """Drop empty segments and collapse segments that extend the previous one.

    Segments are visited in order. A segment whose text is empty after
    stripping whitespace is skipped. If the stripped text of the most
    recently kept segment occurs anywhere inside the current segment's
    stripped text, the current segment replaces that kept segment (so the
    earlier segment, including its timing, is discarded); otherwise the
    current segment is appended.

    The input list is not modified; a new list is returned.
    """
    result = []

    for segment in segments:
        current = segment.text.strip()
        if not current:
            continue

        # Empty string when nothing has been kept yet, which never matches.
        previous = result[-1].text.strip() if result else ""

        if previous and previous in current:
            result[-1] = segment
        else:
            result.append(segment)

    return result
|
|
|
|
|