scripts/transcribe at main · ellite/scripts · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
#!/usr/bin/env python3
"""
GPU Whisper transcription / translation (v2)
────────────────────────────────────────────
• Lists media files for selection
• Auto language detection
• Accuracy modes: Fast / Balanced / Max
• Optional English translation
• Real-time output with word-level timestamps
• Uses GPU (float16)
• Defaults to: large-v3-turbo (if available)
"""

import os
import time
from datetime import timedelta
from faster_whisper import WhisperModel


# ---------- Helpers ----------

def list_media_files():
    """Print a numbered menu of media files found in the current directory.

    Directory entries are taken in sorted order and kept when their name,
    compared case-insensitively, ends with one of the known media
    extensions. Each kept entry is printed as ``<number>: <name>`` with
    numbering starting at 1.

    Returns:
        list[str]: The matching names, in the same order as printed.
    """
    media_suffixes = ('.mkv', '.mp4', '.mp3', '.wav', '.m4a')
    found = []
    for entry in sorted(os.listdir()):
        if not entry.lower().endswith(media_suffixes):
            continue
        found.append(entry)
    # Print only after the scan so the menu numbers match list positions.
    for position, entry in enumerate(found, start=1):
        print(f"{position}: {entry}")
    return found


def get_user_selection(files):
    """Ask the user which of the listed files to process.

    The user enters 1-based menu numbers separated by commas. An empty
    answer selects every file. Tokens that are not plain decimal numbers,
    or that fall outside ``1..len(files)``, are silently ignored.

    Args:
        files: The list of file names previously shown to the user.

    Returns:
        list[str]: The chosen file names in the order they were typed
        (``files`` itself when the answer is empty). May be empty when no
        token was valid.
    """
    selection = input("Enter numbers of files to transcribe (comma-separated), or press Enter for all: ").strip()
    if not selection:
        return files

    chosen = []
    for token in selection.split(","):
        token = token.strip()
        # isdecimal(), not isdigit(): isdigit() also accepts characters such
        # as superscript digits ("²") that int() rejects with ValueError.
        if not token.isdecimal():
            continue
        number = int(token)
        if 0 < number <= len(files):
            chosen.append(files[number - 1])
    return chosen


def choose_accuracy():
    """Prompt for an accuracy mode and return the matching model name.

    Returns:
        str: ``"small"`` for option 1, ``"medium"`` for option 2, and
        ``"large-v3-turbo"`` for option 3 or any other answer (including
        an empty one).
    """
    # The model names must be identifiers faster-whisper recognises; there is
    # no "small-v3" or "medium-v3" model, so those names failed at load time.
    models = {"1": "small", "2": "medium", "3": "large-v3-turbo"}
    print("\nAccuracy mode:")
    print("1) ⚡ Fast (small)")
    print("2) ⚖️  Balanced (medium)")
    print("3) 🧠 Max accuracy (large-v3-turbo if available, else large-v3) [default]")
    choice = input("Enter number [default: 3]: ").strip()
    return models.get(choice, "large-v3-turbo")


def choose_task():
    """Prompt for the task to run and return its mode string.

    Returns:
        str: ``"both"`` when the user answers ``2`` (transcribe and
        translate); ``"transcribe"`` for any other answer, including an
        empty one.
    """
    for menu_line in (
        "\nTask options:",
        "1) Transcribe only",
        "2) Transcribe + Translate to English",
    ):
        print(menu_line)
    answer = input("Enter number [default: 1]: ").strip()
    if answer == "2":
        return "both"
    return "transcribe"

def format_time(seconds):
    """Format an offset in seconds as an SRT timestamp ``HH:MM:SS,mmm``.

    Args:
        seconds: Offset from the start of the media, in seconds (int or
            float). Negative values are clamped to zero.

    Returns:
        str: The timestamp, rounded to the nearest millisecond.
    """
    # Convert to whole milliseconds first. Truncating the float fraction
    # loses a millisecond to representation error (1.001 -> ",000"), and
    # rounding the fraction on its own could produce ",1000" without
    # carrying into the seconds field.
    total_ms = max(0, int(round(float(seconds) * 1000)))
    total_seconds, ms = divmod(total_ms, 1000)
    total_minutes, secs = divmod(total_seconds, 60)
    hours, minutes = divmod(total_minutes, 60)
    return f"{hours:02d}:{minutes:02d}:{secs:02d},{ms:03d}"


def write_srt(filename, segments):
    """Write segments to ``filename`` as SRT cues and report the path.

    Each cue consists of a 1-based index line, a ``start --> end`` line
    built with ``format_time``, the stripped segment text, and a blank
    separator line. The file is written as UTF-8, replacing any existing
    content.

    Args:
        filename: Destination path of the subtitle file.
        segments: Iterable of objects exposing ``start``, ``end`` and
            ``text`` attributes.
    """
    with open(filename, "w", encoding="utf-8") as out:
        cue_number = 0
        for segment in segments:
            cue_number += 1
            begin = format_time(segment.start)
            finish = format_time(segment.end)
            caption = segment.text.strip()
            out.write(f"{cue_number}\n{begin} --> {finish}\n{caption}\n\n")
    print(f"✅ Saved subtitles to: {filename}")

# ---------- Core ----------

def transcribe_file(input_file, model, task_mode):
    """Transcribe one media file to SRT and optionally translate it to English.

    Writes ``<base>.<lang>.srt``, where ``<base>`` is ``input_file`` without
    its extension and ``<lang>`` is the detected language code. Each segment
    is echoed to stdout as it is produced. When ``task_mode`` is ``"both"``
    and the detected language does not start with ``"en"``, a further pass
    with ``task="translate"`` writes ``<base>.en.srt``. Timing information
    for each stage is printed.

    Args:
        input_file: Path of the media file to process.
        model: Object exposing a ``transcribe`` method compatible with
            ``faster_whisper.WhisperModel.transcribe``.
        task_mode: ``"both"`` to also translate; any other value
            transcribes only.
    """
    overall_start = time.time()
    print(f"\n🎧 Processing: {input_file}")
    base = os.path.splitext(input_file)[0]

    # ----- Detect language -----
    # Only ``info`` is needed here; the returned segment generator is
    # deliberately discarded.
    print("🌍 Detecting language…")
    _, info = model.transcribe(input_file, beam_size=1, vad_filter=True, task="transcribe", max_initial_timestamp=30)
    print(f"🌍 Detected language: {info.language.upper()} ({info.language_probability * 100:.0f}%)")

    # ----- Transcription -----
    print("📝 Transcribing with word-level precision…")
    trans_start = time.time()

    # Pass the detected language so this pass does not repeat detection and
    # is guaranteed to use the language named in the output file.
    segments_gen, _ = model.transcribe(
        input_file,
        beam_size=5,
        vad_filter=True,
        word_timestamps=True,
        task="transcribe",
        language=info.language,
    )

    srt_path = f"{base}.{info.language}.srt"
    # Written inline rather than through write_srt() so each segment can be
    # printed while the generator is still being consumed.
    with open(srt_path, "w", encoding="utf-8") as srt_file:
        for i, seg in enumerate(segments_gen, start=1):
            start_time = format_time(seg.start)
            end_time = format_time(seg.end)
            text = seg.text.strip()
            print(f"[{start_time} - {end_time}] {text}")
            srt_file.write(f"{i}\n{start_time} --> {end_time}\n{text}\n\n")

    trans_end = time.time()
    print(f"✅ Saved subtitles to: {srt_path}")
    print(f"🕒 Transcription completed in {trans_end - trans_start:.1f} seconds")

    # ----- Optional translation -----
    if task_mode == "both":
        if info.language.lower().startswith("en"):
            print("⚠️  Skipping translation — source language already English.")
        else:
            print("\n🌐 Translating to English…")
            transl_start = time.time()

            translated_gen, _ = model.transcribe(
                input_file,
                beam_size=5,
                vad_filter=True,
                word_timestamps=True,
                task="translate",
                language=info.language,
            )
            # Consume the generator before stopping the clock so the
            # reported time covers the translation work itself.
            translated_segments = list(translated_gen)
            transl_end = time.time()

            t_srt = f"{base}.en.srt"
            write_srt(t_srt, translated_segments)
            # Measure from the start of the translation pass, not from the
            # end of the transcription pass.
            print(f"🕒 Translation completed in {transl_end - transl_start:.1f} seconds")

    overall_end = time.time()
    total_time = overall_end - overall_start
    print(f"🕒 Total processing time: {total_time:.1f} seconds\n")


# ---------- Main ----------

def main():
    """Run the interactive workflow: pick files, pick options, transcribe.

    Lists media files in the current directory, asks which to process,
    which accuracy mode to use and whether to translate, loads the model on
    CUDA with float16, and processes each selected file in turn. If loading
    ``large-v3-turbo`` fails, ``large-v3`` is tried instead; failures for
    any other model are re-raised.
    """
    files = list_media_files()
    if not files:
        print("No media files found.")
        return

    selected = get_user_selection(files)
    if not selected:
        # Every entered number was invalid; stop before the remaining
        # prompts and the model load, which would otherwise do nothing.
        print("No valid files selected.")
        return
    model_size = choose_accuracy()
    task_mode = choose_task()

    print(f"\n🔁 Loading model {model_size} on GPU (float16)…")
    try:
        model = WhisperModel(model_size, device="cuda", compute_type="float16")
    except Exception as exc:
        if "large-v3-turbo" not in model_size:
            # Bare raise keeps the original traceback intact.
            raise
        print("⚠️ large-v3-turbo not found, falling back to large-v3")
        # Show the real cause: the failure may be unrelated to the model
        # name (for example a CUDA problem), in which case the fallback
        # fails too.
        print(f"   (load error: {exc})")
        model = WhisperModel("large-v3", device="cuda", compute_type="float16")

    for f in selected:
        transcribe_file(f, model, task_mode)


# Run the interactive workflow only when executed as a script, not on import.
if __name__ == "__main__":
    main()