From 3adbc16dfb08318398cebeffd111fc5f68ed8007 Mon Sep 17 00:00:00 2001 From: BigDaddyAman <139612136+BigDaddyAman@users.noreply.github.com> Date: Mon, 2 Feb 2026 01:52:38 +0800 Subject: [PATCH] Initial release: YouTube Transcript API v1.0.0 --- .gitignore | 5 ++ CONTRIBUTING | 76 ++++++++++++++++++ Dockerfile | 19 +++++ LICENSE | 21 +++++ README.md | 176 ++++++++++++++++++++++++++++++++++++++++++ __init__.py | 0 app/__init__.py | 0 app/main.py | 19 +++++ core/config.py | 19 +++++ core/errors.py | 21 +++++ requirements.txt | 6 ++ routes/health.py | 8 ++ routes/transcript.py | 45 +++++++++++ schemas/error.py | 6 ++ schemas/transcript.py | 23 ++++++ services/captions.py | 51 ++++++++++++ services/metadata.py | 11 +++ services/ytdlp.py | 50 ++++++++++++ utils/filesystem.py | 5 ++ utils/validators.py | 28 +++++++ 20 files changed, 589 insertions(+) create mode 100644 .gitignore create mode 100644 CONTRIBUTING create mode 100644 Dockerfile create mode 100644 LICENSE create mode 100644 README.md create mode 100644 __init__.py create mode 100644 app/__init__.py create mode 100644 app/main.py create mode 100644 core/config.py create mode 100644 core/errors.py create mode 100644 requirements.txt create mode 100644 routes/health.py create mode 100644 routes/transcript.py create mode 100644 schemas/error.py create mode 100644 schemas/transcript.py create mode 100644 services/captions.py create mode 100644 services/metadata.py create mode 100644 services/ytdlp.py create mode 100644 utils/filesystem.py create mode 100644 utils/validators.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..6ac1cd7 --- /dev/null +++ b/.gitignore @@ -0,0 +1,5 @@ +.venv/ +__pycache__/ +*.pyc +.env +.DS_Store \ No newline at end of file diff --git a/CONTRIBUTING b/CONTRIBUTING new file mode 100644 index 0000000..644ac1b --- /dev/null +++ b/CONTRIBUTING @@ -0,0 +1,76 @@ +# Contributing + +Thanks for your interest in contributing! 
🎉 +This project aims to stay **simple, stable, and template-friendly**, so please read this first. + +--- + +## 🧭 Project Principles + +- **Caption-only** (no audio/video downloads) +- **Stateless** (no database required) +- **Railway & Docker friendly** +- **Minimal dependencies** +- **Clear API contracts** + +Changes that break these principles are unlikely to be accepted. + +--- + +## 🐛 Reporting Bugs + +Please include: +- The YouTube URL used +- Expected vs actual behavior +- Logs or error messages +- Whether captions were human or auto-generated + +Open an issue with a clear title and reproduction steps. + +--- + +## ✨ Feature Requests + +Good feature requests: +- Improve caption parsing / cleanup +- Better validation or error messages +- Performance or stability improvements +- Optional flags that do NOT break defaults + +Please avoid: +- Adding mandatory databases +- Downloading media files +- Authentication requirements + +--- + +## 🧪 Development Setup + +```bash +python -m venv .venv +source .venv/bin/activate # Windows: .venv\Scripts\activate +pip install -r requirements.txt +uvicorn app.main:app --reload +``` + +Open: +``` +http://localhost:8000/docs +``` + +--- + +## 🧹 Code Style + +- Python 3.12+ compatible +- Use type hints +- Keep functions small and readable +- Avoid over-engineering + +--- + +## 📜 License + +By contributing, you agree that your contributions will be licensed under the **MIT License**. + +Thank you for helping improve the project 🙌 diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..cd5a75d --- /dev/null +++ b/Dockerfile @@ -0,0 +1,19 @@ +FROM python:3.13-slim + +ENV PYTHONUNBUFFERED=1 +ENV PIP_NO_CACHE_DIR=1 + +RUN apt-get update && apt-get install -y \ + ca-certificates \ + && rm -rf /var/lib/apt/lists/* + +WORKDIR /app + +COPY requirements.txt . +RUN pip install -r requirements.txt + +COPY . . 
+ +EXPOSE 8000 + +CMD ["sh", "-c", "uvicorn app.main:app --host 0.0.0.0 --port ${PORT:-8000}"] diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..11618b8 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2026 Aman + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/README.md b/README.md new file mode 100644 index 0000000..6b5aea7 --- /dev/null +++ b/README.md @@ -0,0 +1,176 @@ +![License](https://img.shields.io/badge/license-MIT-blue.svg) +![Python](https://img.shields.io/badge/python-3.13-blue) +![Framework](https://img.shields.io/badge/FastAPI-green) +![Docker](https://img.shields.io/badge/docker-supported-blue) + +# YouTube Transcript API + +A lightweight **FastAPI** service that extracts **YouTube video captions (no speech-to-text)**. +No video or audio downloads — just clean, structured captions returned as JSON. + +Built to be simple, stateless, and easy to deploy anywhere. 
+ +--- + +## ✨ Features + +* Extract **human or auto-generated captions** +* No media downloads (captions only) +* Clean JSON output with timestamps +* Accepts normal, playlist, and radio-style YouTube URLs (single video only) +* Docker friendly +* Built-in Swagger UI at `/docs` + +--- + +## 🧪 API Usage + +### Endpoint + +``` +POST /transcript +``` + +### Query Parameters + +| Name | Type | Required | Description | +| ----- | ------ | -------- | ----------------- | +| `url` | string | ✅ | YouTube video URL | + +Supported URLs: + +* `https://www.youtube.com/watch?v=VIDEO_ID` +* `https://www.youtube.com/watch?v=VIDEO_ID&list=RD...` +* `https://youtu.be/VIDEO_ID` + +--- + +### Example (curl) + +```bash +curl -X POST "http://localhost:8000/transcript?url=https://www.youtube.com/watch?v=PY9DcIMGxMs" +``` + +--- + +### Example Response + +```json +{ + "video": { + "id": "PY9DcIMGxMs", + "title": "Everything you think you know about addiction is wrong | TED", + "channel": "TED", + "duration": 882, + "url": "https://www.youtube.com/watch?v=PY9DcIMGxMs" + }, + "captions": [ + { + "start": 12.597, + "end": 14.338, + "text": "One of my earliest memories" + } + ], + "language": "auto", + "source": "human" +} +``` + +--- + +## 📄 API Docs + +Once running, open: + +``` +/docs +``` + +Swagger UI is enabled by default. + +--- + +## 🐳 Run Locally with Docker + +### Build + +```bash +docker build -t youtube-transcript-api . +``` + +### Run + +```bash +docker run -p 8000:8000 youtube-transcript-api +``` + +Then open: + +``` +http://localhost:8000/docs +``` + +--- + +## ⚙️ Environment Variables (Optional) + +No environment variables are required. 
+ +| Variable | Default | Description | +| ----------------- | ------- | ---------------------------------- | +| `PORT` | `8000` | Port to bind | +| `REQUEST_TIMEOUT` | `25` | yt-dlp execution timeout (seconds) | + +--- + +## 🧠 Design Notes + +* Uses `yt-dlp` **only for metadata and captions** +* No Redis, database, or background workers +* Fully stateless and container-friendly +* Designed to fail safely with clear error responses + +--- + +## ⚠️ Notes on Reliability + +This project depends on **YouTube availability and yt-dlp behavior**. + +On cloud platforms, requests may occasionally fail due to: + +* IP-based rate limiting +* YouTube bot detection +* regional consent or throttling + +When this happens, the API returns a structured error instead of crashing. + +--- + +## ⚠️ Limitations + +* Does **not** download audio or video +* Does **not** perform speech-to-text +* Captions must already exist on YouTube +* Shorts and embedded players are not a primary target + +--- + +## 📜 License + +MIT License + +--- + +## 🙌 Credits + +* FastAPI — [https://fastapi.tiangolo.com/](https://fastapi.tiangolo.com/) +* yt-dlp — [https://github.com/yt-dlp/yt-dlp](https://github.com/yt-dlp/yt-dlp) + +--- + +### ✅ Status + +* Docker tested +* Real-world URLs tested +* Cloud-friendly +* Ready for open-source use diff --git a/__init__.py b/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/app/__init__.py b/app/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/app/main.py b/app/main.py new file mode 100644 index 0000000..31792d3 --- /dev/null +++ b/app/main.py @@ -0,0 +1,19 @@ +from fastapi import FastAPI +from routes.health import router as health_router +from routes.transcript import router as transcript_router + +app = FastAPI( + title="YouTube Transcript API", + description="Caption-only YouTube transcript extraction (no downloads)", + version="1.0.0", +) + +@app.get("/") +def root(): + return { + "name": "YouTube 
Transcript API", + "docs": "/docs" + } + +app.include_router(health_router) +app.include_router(transcript_router) diff --git a/core/config.py b/core/config.py new file mode 100644 index 0000000..7aa8ed1 --- /dev/null +++ b/core/config.py @@ -0,0 +1,19 @@ +from pydantic_settings import BaseSettings + + +class Settings(BaseSettings): + app_env: str = "production" + + request_timeout: int = 25 + max_video_duration: int = 7200 + + enable_redis: bool = False + enable_postgres: bool = False + enable_rate_limit: bool = False + + class Config: + env_file = ".env" + extra = "ignore" + + +settings = Settings() diff --git a/core/errors.py b/core/errors.py new file mode 100644 index 0000000..9d8b597 --- /dev/null +++ b/core/errors.py @@ -0,0 +1,21 @@ +from fastapi import HTTPException + + +def bad_request(message: str, code: str = "BAD_REQUEST"): + raise HTTPException( + status_code=400, + detail={ + "error": message, + "code": code, + }, + ) + + +def not_found(message: str, code: str = "NOT_FOUND"): + raise HTTPException( + status_code=404, + detail={ + "error": message, + "code": code, + }, + ) diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..2fe3644 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,6 @@ +fastapi==0.128.0 +uvicorn==0.40.0 +pydantic==2.12.5 +pydantic-settings==2.12.0 +webvtt-py==0.5.1 +yt-dlp==2026.1.31 diff --git a/routes/health.py b/routes/health.py new file mode 100644 index 0000000..b4cb63b --- /dev/null +++ b/routes/health.py @@ -0,0 +1,8 @@ +from fastapi import APIRouter + +router = APIRouter() + + +@router.get("/health") +def health(): + return {"status": "ok"} diff --git a/routes/transcript.py b/routes/transcript.py new file mode 100644 index 0000000..43be1a6 --- /dev/null +++ b/routes/transcript.py @@ -0,0 +1,45 @@ +from fastapi import APIRouter, Query +from utils.validators import validate_youtube_url +from utils.filesystem import temp_dir +from core.errors import not_found +from services.ytdlp import 
extract_metadata_and_captions +from services.captions import parse_vtt +from services.metadata import normalize_metadata +from schemas.transcript import TranscriptResponse + +router = APIRouter() + + +@router.post("/transcript", response_model=TranscriptResponse) +def transcript( + url: str = Query(..., description="YouTube video URL"), +): + validate_youtube_url(url) + + with temp_dir() as tmp: + metadata, caption_files = extract_metadata_and_captions(url, tmp) + + if not caption_files: + not_found("No captions available for this video", "NO_CAPTIONS") + + human = [p for p in caption_files if "auto" not in p.name.lower()] + auto = [p for p in caption_files if "auto" in p.name.lower()] + + if human: + caption_path = human[0] + source = "human" + elif auto: + caption_path = auto[0] + source = "auto" + else: + not_found("No captions available", "NO_CAPTIONS") + + captions = parse_vtt(str(caption_path)) + video = normalize_metadata(metadata) + + return TranscriptResponse( + video=video, + captions=captions, + language="auto", + source=source, + ) diff --git a/schemas/error.py b/schemas/error.py new file mode 100644 index 0000000..b69f7b4 --- /dev/null +++ b/schemas/error.py @@ -0,0 +1,6 @@ +from pydantic import BaseModel + + +class ErrorResponse(BaseModel): + error: str + code: str diff --git a/schemas/transcript.py b/schemas/transcript.py new file mode 100644 index 0000000..0f4e288 --- /dev/null +++ b/schemas/transcript.py @@ -0,0 +1,23 @@ +from pydantic import BaseModel +from typing import List +from typing import List, Literal + +class CaptionSegment(BaseModel): + start: float + end: float + text: str + + +class VideoMetadata(BaseModel): + id: str + title: str + channel: str + duration: int + url: str + + +class TranscriptResponse(BaseModel): + video: VideoMetadata + captions: List[CaptionSegment] + language: str + source: Literal["human", "auto"] diff --git a/services/captions.py b/services/captions.py new file mode 100644 index 0000000..7a356e4 --- /dev/null +++ 
b/services/captions.py @@ -0,0 +1,51 @@ +import webvtt +from schemas.transcript import CaptionSegment +from typing import List + + +def parse_vtt(path: str): + segments = [] + + for caption in webvtt.read(path): + segments.append( + CaptionSegment( + start=_to_seconds(caption.start), + end=_to_seconds(caption.end), + text=caption.text.strip(), + ) + ) + + return dedupe_segments(segments) + + +def _to_seconds(ts: str) -> float: + h, m, rest = ts.split(":") + s, ms = rest.split(".") + return ( + int(h) * 3600 + + int(m) * 60 + + int(s) + + int(ms) / 1000 + ) + +def dedupe_segments(segments: List[CaptionSegment]) -> List[CaptionSegment]: + cleaned = [] + + for seg in segments: + text = seg.text.strip() + if not text: + continue + + if cleaned: + prev = cleaned[-1] + prev_text = prev.text.strip() + + if prev_text and prev_text in text: + cleaned[-1] = seg + continue + + cleaned.append(seg) + + return cleaned + + diff --git a/services/metadata.py b/services/metadata.py new file mode 100644 index 0000000..566409e --- /dev/null +++ b/services/metadata.py @@ -0,0 +1,11 @@ +from schemas.transcript import VideoMetadata + + +def normalize_metadata(raw: dict) -> VideoMetadata: + return VideoMetadata( + id=raw["id"], + title=raw["title"], + channel=raw.get("uploader", ""), + duration=raw.get("duration", 0), + url=raw["webpage_url"], + ) diff --git a/services/ytdlp.py b/services/ytdlp.py new file mode 100644 index 0000000..f72f30b --- /dev/null +++ b/services/ytdlp.py @@ -0,0 +1,50 @@ +import json +import subprocess +from pathlib import Path +from typing import Tuple, List + +from core.errors import bad_request +from core.config import settings + + +def extract_metadata_and_captions( + url: str, + workdir: str, +) -> Tuple[dict, List[Path]]: + cmd = [ + "yt-dlp", + "--skip-download", + "--write-subs", + "--write-auto-subs", + "--sub-format", "vtt", + "--no-playlist", + "--print-json", + "-o", f"{workdir}/%(id)s", + url, + ] + + try: + result = subprocess.run( + cmd, + 
capture_output=True, + text=True, + check=True, + timeout=settings.request_timeout, + ) + except subprocess.TimeoutExpired: + bad_request("yt-dlp timed out", "TIMEOUT") + except subprocess.CalledProcessError: + bad_request("Failed to extract video data", "YTDLP_ERROR") + + lines = result.stdout.splitlines() + if not lines: + bad_request("No metadata returned from yt-dlp", "EMPTY_RESPONSE") + + try: + metadata = json.loads(lines[0]) + except json.JSONDecodeError: + bad_request("Invalid metadata returned from yt-dlp", "INVALID_METADATA") + + subtitle_files = list(Path(workdir).glob("*.vtt")) + + return metadata, subtitle_files diff --git a/utils/filesystem.py b/utils/filesystem.py new file mode 100644 index 0000000..03c0170 --- /dev/null +++ b/utils/filesystem.py @@ -0,0 +1,5 @@ +from tempfile import TemporaryDirectory + + +def temp_dir(): + return TemporaryDirectory() diff --git a/utils/validators.py b/utils/validators.py new file mode 100644 index 0000000..378ef34 --- /dev/null +++ b/utils/validators.py @@ -0,0 +1,28 @@ +from urllib.parse import urlparse, parse_qs +from core.errors import bad_request + + +YOUTUBE_DOMAINS = ("youtube.com", "www.youtube.com", "youtu.be") + + +def validate_youtube_url(url: str): + try: + parsed = urlparse(url) + except Exception: + bad_request("Invalid YouTube URL", "INVALID_URL") + + if parsed.netloc not in YOUTUBE_DOMAINS: + bad_request("Invalid YouTube URL", "INVALID_URL") + + if parsed.netloc == "youtu.be": + if not parsed.path.strip("/"): + bad_request("Invalid YouTube video URL", "INVALID_URL") + return + + if parsed.path == "/watch": + qs = parse_qs(parsed.query) + if "v" not in qs or not qs["v"][0]: + bad_request("Invalid YouTube video URL", "INVALID_URL") + return + + bad_request("Invalid YouTube video URL", "INVALID_URL")