# youtube-transcript-api/services/captions.py

import webvtt
from schemas.transcript import CaptionSegment
from typing import List


def parse_vtt(path: str):
    """Read the WebVTT file at ``path`` and return its de-duplicated captions.

    Every caption yielded by ``webvtt.read`` becomes a ``CaptionSegment`` whose
    ``start`` and ``end`` are the caption timestamps converted with
    ``_to_seconds`` and whose ``text`` is the caption text with surrounding
    whitespace removed. The collected list is passed through
    ``dedupe_segments`` and that result is returned.
    """
    parsed = [
        CaptionSegment(
            start=_to_seconds(cue.start),
            end=_to_seconds(cue.end),
            text=cue.text.strip(),
        )
        for cue in webvtt.read(path)
    ]
    return dedupe_segments(parsed)


def _to_seconds(ts: str) -> float:
    h, m, rest = ts.split(":")
    s, ms = rest.split(".")
    return (
        int(h) * 3600
        + int(m) * 60
        + int(s)
        + int(ms) / 1000
    )

def dedupe_segments(segments: List[CaptionSegment]) -> List[CaptionSegment]:
    """Drop blank segments and collapse consecutive overlapping ones.

    Segments whose text is empty after stripping whitespace are skipped.
    When the stripped text of the most recently kept segment is a substring
    of the current segment's stripped text, the current segment replaces
    that kept segment outright (the earlier segment object is discarded, not
    merged). Otherwise the current segment is appended.

    Args:
        segments: Segments in the order they should be compared.

    Returns:
        A new list holding the surviving segment objects in their original
        relative order.
    """
    result = []

    for current in segments:
        current_text = current.text.strip()
        if not current_text:
            continue

        # An empty ``last_text`` means there is nothing to compare against.
        last_text = result[-1].text.strip() if result else ""

        if last_text and last_text in current_text:
            result[-1] = current
        else:
            result.append(current)

    return result