YouTube Transcript Python: Complete Developer Guide [2026]

Python is the most popular language for extracting YouTube transcripts programmatically. The youtube-transcript-api library makes it simple to get transcripts without requiring YouTube Data API credentials. This guide covers everything from basic extraction to building production applications.

Quick Start: Get Your First Transcript

Installation

pip install youtube-transcript-api

Basic Usage

from youtube_transcript_api import YouTubeTranscriptApi

# Get transcript for a video
video_id = "dQw4w9WgXcQ"  # Extract from URL
transcript = YouTubeTranscriptApi.get_transcript(video_id)

# Print the transcript
for entry in transcript:
    print(f"[{entry['start']:.2f}] {entry['text']}")

Output Structure

# Each entry in the transcript list:
{
    'text': 'Hello and welcome to this video',
    'start': 0.0,        # Start time in seconds
    'duration': 4.5      # Duration in seconds
}

Extracting Video IDs from URLs

Helper Function

import re
from urllib.parse import urlparse, parse_qs

def extract_video_id(url: str) -> str:
    """Extract video ID from various YouTube URL formats."""
    
    # Handle different URL formats
    patterns = [
        r'(?:v=|/)([0-9A-Za-z_-]{11}).*',  # Standard and shared URLs
        r'(?:embed/)([0-9A-Za-z_-]{11})',    # Embed URLs
        r'(?:shorts/)([0-9A-Za-z_-]{11})',   # Shorts URLs
    ]
    
    for pattern in patterns:
        match = re.search(pattern, url)
        if match:
            return match.group(1)
    
    # Try parsing as query parameter
    parsed = urlparse(url)
    if parsed.query:
        params = parse_qs(parsed.query)
        if 'v' in params:
            return params['v'][0]
    
    raise ValueError(f"Could not extract video ID from: {url}")

# Usage
video_id = extract_video_id("https://www.youtube.com/watch?v=dQw4w9WgXcQ")

Working with Multiple Languages

List Available Transcripts

from youtube_transcript_api import YouTubeTranscriptApi

video_id = "dQw4w9WgXcQ"

# Get list of available transcripts
transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)

for transcript in transcript_list:
    print(f"Language: {transcript.language}")
    print(f"Language Code: {transcript.language_code}")
    print(f"Is Generated: {transcript.is_generated}")
    print(f"Is Translatable: {transcript.is_translatable}")
    print("---")

Get Specific Language

# Get transcript in specific language
transcript = YouTubeTranscriptApi.get_transcript(
    video_id,
    languages=['en', 'en-US', 'en-GB']  # Priority order
)

# Or get auto-generated if manual not available
transcript = YouTubeTranscriptApi.get_transcript(
    video_id,
    languages=['en'],
    preserve_formatting=True
)

Translate Transcripts

from youtube_transcript_api import YouTubeTranscriptApi

video_id = "dQw4w9WgXcQ"
transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)

# Find English transcript and translate to Spanish
transcript = transcript_list.find_transcript(['en'])
translated = transcript.translate('es')
spanish_transcript = translated.fetch()

for entry in spanish_transcript:
    print(entry['text'])

Error Handling

Common Exceptions

from youtube_transcript_api import YouTubeTranscriptApi
from youtube_transcript_api._errors import (
    TranscriptsDisabled,
    NoTranscriptFound,
    VideoUnavailable,
    NoTranscriptAvailable
)

def get_transcript_safe(video_id: str, languages: list = ['en']) -> dict:
    """Safely get transcript with error handling."""
    try:
        transcript = YouTubeTranscriptApi.get_transcript(video_id, languages)
        return {
            'success': True,
            'transcript': transcript,
            'error': None
        }
    
    except TranscriptsDisabled:
        return {
            'success': False,
            'transcript': None,
            'error': 'Transcripts are disabled for this video'
        }
    
    except NoTranscriptFound:
        return {
            'success': False,
            'transcript': None,
            'error': f'No transcript found in languages: {languages}'
        }
    
    except VideoUnavailable:
        return {
            'success': False,
            'transcript': None,
            'error': 'Video is unavailable (private, deleted, or restricted)'
        }
    
    except NoTranscriptAvailable:
        return {
            'success': False,
            'transcript': None,
            'error': 'No transcripts available for this video'
        }
    
    except Exception as e:
        return {
            'success': False,
            'transcript': None,
            'error': f'Unexpected error: {str(e)}'
        }

Formatting Transcripts

Convert to Plain Text

def transcript_to_text(transcript: list, include_timestamps: bool = False) -> str:
    """Convert transcript to plain text."""
    lines = []
    
    for entry in transcript:
        if include_timestamps:
            # Format: [MM:SS] Text
            minutes = int(entry['start'] // 60)
            seconds = int(entry['start'] % 60)
            lines.append(f"[{minutes}:{seconds:02d}] {entry['text']}")
        else:
            lines.append(entry['text'])
    
    return '\n'.join(lines) if include_timestamps else ' '.join(lines)

Convert to SRT Format

def transcript_to_srt(transcript: list) -> str:
    """Convert transcript to SRT subtitle format."""
    srt_lines = []
    
    for i, entry in enumerate(transcript, 1):
        start = entry['start']
        end = start + entry['duration']
        
        # Format timestamps: HH:MM:SS,mmm
        start_str = format_srt_timestamp(start)
        end_str = format_srt_timestamp(end)
        
        srt_lines.append(str(i))
        srt_lines.append(f"{start_str} --> {end_str}")
        srt_lines.append(entry['text'])
        srt_lines.append('')  # Blank line separator
    
    return '\n'.join(srt_lines)

def format_srt_timestamp(seconds: float) -> str:
    """Format seconds as SRT timestamp."""
    hours = int(seconds // 3600)
    minutes = int((seconds % 3600) // 60)
    secs = int(seconds % 60)
    millis = int((seconds % 1) * 1000)
    return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"

Convert to JSON

import json

def transcript_to_json(transcript: list, video_id: str = None) -> str:
    """Convert transcript to JSON format."""
    data = {
        'video_id': video_id,
        'transcript': transcript,
        'word_count': sum(len(entry['text'].split()) for entry in transcript),
        'duration': transcript[-1]['start'] + transcript[-1]['duration'] if transcript else 0
    }
    return json.dumps(data, indent=2)

Batch Processing

Process Multiple Videos

from youtube_transcript_api import YouTubeTranscriptApi
from concurrent.futures import ThreadPoolExecutor, as_completed
import time

def batch_get_transcripts(video_ids: list, max_workers: int = 5) -> dict:
    """Get transcripts for multiple videos concurrently."""
    results = {}
    
    def get_single(video_id):
        try:
            transcript = YouTubeTranscriptApi.get_transcript(video_id)
            return video_id, {'success': True, 'transcript': transcript}
        except Exception as e:
            return video_id, {'success': False, 'error': str(e)}
    
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {executor.submit(get_single, vid): vid for vid in video_ids}
        
        for future in as_completed(futures):
            video_id, result = future.result()
            results[video_id] = result
            time.sleep(0.1)  # Rate limiting
    
    return results

# Usage
video_ids = ["video1_id", "video2_id", "video3_id"]
results = batch_get_transcripts(video_ids)

for video_id, result in results.items():
    if result['success']:
        print(f"{video_id}: Got {len(result['transcript'])} segments")
    else:
        print(f"{video_id}: Error - {result['error']}")

Process YouTube Playlist

from pytube import Playlist  # pip install pytube

def get_playlist_transcripts(playlist_url: str) -> dict:
    """Get transcripts for all videos in a playlist."""
    playlist = Playlist(playlist_url)
    
    results = {}
    for video_url in playlist.video_urls:
        video_id = extract_video_id(video_url)
        results[video_id] = get_transcript_safe(video_id)
        time.sleep(0.5)  # Rate limiting
    
    return results

Building a Transcript Service

Flask API Example

from flask import Flask, request, jsonify
from youtube_transcript_api import YouTubeTranscriptApi

app = Flask(__name__)

@app.route('/api/transcript', methods=['GET'])
def get_transcript():
    video_id = request.args.get('video_id')
    language = request.args.get('language', 'en')
    format_type = request.args.get('format', 'json')
    
    if not video_id:
        return jsonify({'error': 'video_id required'}), 400
    
    result = get_transcript_safe(video_id, [language])
    
    if not result['success']:
        return jsonify({'error': result['error']}), 404
    
    transcript = result['transcript']
    
    if format_type == 'text':
        return transcript_to_text(transcript)
    elif format_type == 'srt':
        return transcript_to_srt(transcript)
    else:
        return jsonify({
            'video_id': video_id,
            'language': language,
            'transcript': transcript
        })

if __name__ == '__main__':
    app.run(debug=True)

CLI Tool Example

import argparse
from youtube_transcript_api import YouTubeTranscriptApi

def main():
    parser = argparse.ArgumentParser(description='YouTube Transcript Extractor')
    parser.add_argument('url', help='YouTube video URL')
    parser.add_argument('-l', '--language', default='en', help='Language code')
    parser.add_argument('-o', '--output', help='Output file path')
    parser.add_argument('-f', '--format', choices=['text', 'srt', 'json'], default='text')
    parser.add_argument('--timestamps', action='store_true', help='Include timestamps')
    
    args = parser.parse_args()
    
    video_id = extract_video_id(args.url)
    result = get_transcript_safe(video_id, [args.language])
    
    if not result['success']:
        print(f"Error: {result['error']}")
        return
    
    transcript = result['transcript']
    
    if args.format == 'srt':
        output = transcript_to_srt(transcript)
    elif args.format == 'json':
        output = transcript_to_json(transcript, video_id)
    else:
        output = transcript_to_text(transcript, args.timestamps)
    
    if args.output:
        with open(args.output, 'w', encoding='utf-8') as f:
            f.write(output)
        print(f"Saved to {args.output}")
    else:
        print(output)

if __name__ == '__main__':
    main()

Advanced Techniques

Caching Transcripts

import hashlib
import json
import os
from datetime import datetime, timedelta

class TranscriptCache:
    def __init__(self, cache_dir: str = '.transcript_cache', ttl_hours: int = 24):
        self.cache_dir = cache_dir
        self.ttl = timedelta(hours=ttl_hours)
        os.makedirs(cache_dir, exist_ok=True)
    
    def _get_cache_path(self, video_id: str) -> str:
        return os.path.join(self.cache_dir, f"{video_id}.json")
    
    def get(self, video_id: str) -> list | None:
        cache_path = self._get_cache_path(video_id)
        
        if not os.path.exists(cache_path):
            return None
        
        with open(cache_path, 'r') as f:
            data = json.load(f)
        
        cached_time = datetime.fromisoformat(data['timestamp'])
        if datetime.now() - cached_time > self.ttl:
            os.remove(cache_path)
            return None
        
        return data['transcript']
    
    def set(self, video_id: str, transcript: list):
        cache_path = self._get_cache_path(video_id)
        data = {
            'timestamp': datetime.now().isoformat(),
            'transcript': transcript
        }
        with open(cache_path, 'w') as f:
            json.dump(data, f)

# Usage
cache = TranscriptCache()
cached = cache.get(video_id)
if cached:
    transcript = cached
else:
    transcript = YouTubeTranscriptApi.get_transcript(video_id)
    cache.set(video_id, transcript)

Text Analysis

from collections import Counter
import re

def analyze_transcript(transcript: list) -> dict:
    """Analyze transcript for insights."""
    full_text = ' '.join(entry['text'] for entry in transcript)
    words = re.findall(r'\b\w+\b', full_text.lower())
    
    return {
        'total_segments': len(transcript),
        'total_words': len(words),
        'unique_words': len(set(words)),
        'duration_seconds': transcript[-1]['start'] + transcript[-1]['duration'],
        'words_per_minute': len(words) / (transcript[-1]['start'] / 60) if transcript else 0,
        'most_common_words': Counter(words).most_common(10)
    }

Frequently Asked Questions

Q1Do I need a YouTube API key?

No. The youtube-transcript-api library doesn't require API credentials—it accesses the same caption data that YouTube's website uses. This makes it simpler but means you're subject to YouTube's rate limits.

Q2How do I handle rate limiting?

Add delays between requests (0.5-1 second) and use exponential backoff on errors. For heavy usage, consider caching transcripts and spreading requests over time.

Q3Can I get transcripts for private videos?

No. The library can only access transcripts for public and unlisted videos. Private videos require authentication which this library doesn't support.

Q4Why do some videos return no transcript?

The video may have captions disabled, be too new for auto-captions, or be content like music that doesn't get auto-captioned. Check error messages to determine the cause.

Q5How accurate are the transcripts?

Accuracy depends on the source. Manual captions are 98%+ accurate. Auto-generated captions are 85-95% accurate depending on audio quality and speech clarity.

Conclusion

Python's youtube-transcript-api makes transcript extraction straightforward for developers. From simple single-video extraction to building production transcript services, the library provides all necessary functionality. Remember to handle errors gracefully, implement caching for performance, and respect YouTube's rate limits.

Quick reference:

# Install
pip install youtube-transcript-api

# Basic usage
from youtube_transcript_api import YouTubeTranscriptApi
transcript = YouTubeTranscriptApi.get_transcript("VIDEO_ID")

Start building your transcript applications today!

On This Page

YouTube Transcript Python: Complete Developer Guide [2026]

Key Takeaways

Quick Start: Get Your First Transcript

Installation

Basic Usage

Output Structure

Extracting Video IDs from URLs

Helper Function

Working with Multiple Languages

List Available Transcripts

Get Specific Language

Translate Transcripts

Error Handling

Common Exceptions

Formatting Transcripts

Convert to Plain Text

Convert to SRT Format

Convert to JSON

Batch Processing

Process Multiple Videos

Process YouTube Playlist

Building a Transcript Service

Flask API Example

CLI Tool Example

Advanced Techniques

Caching Transcripts

Text Analysis

Frequently Asked Questions

Conclusion

Written By

Sources & References

Related Resources

Free Tools

Use Cases

YouTube Transcript API 2026: Free Python Library (No API Key Required)

How to Get YouTube Transcript: Complete Guide for 2026

How to Download YouTube Transcript as Text File [2026]

Resources

Support

On This Page

YouTube Transcript Python: Complete Developer Guide [2026]

Key Takeaways

Quick Start: Get Your First Transcript

Installation

Basic Usage

Output Structure

Extracting Video IDs from URLs

Helper Function

Working with Multiple Languages

List Available Transcripts

Get Specific Language

Translate Transcripts

Error Handling

Common Exceptions

Formatting Transcripts

Convert to Plain Text

Convert to SRT Format

Convert to JSON

Batch Processing

Process Multiple Videos

Process YouTube Playlist

Building a Transcript Service

Flask API Example

CLI Tool Example

Advanced Techniques

Caching Transcripts

Text Analysis

Frequently Asked Questions

Conclusion

Written By

Sources & References

Related Resources

Free Tools

Use Cases

Related Articles

YouTube Transcript API 2026: Free Python Library (No API Key Required)

How to Get YouTube Transcript: Complete Guide for 2026

How to Download YouTube Transcript as Text File [2026]