#@title Whisper Transcription Parameters
model_size = "large-v2" # @param ["large-v3", "large-v2", "medium", "large"]
language = "japanese" # @param {type:"string"}
translation_mode = "End-to-end Whisper (default)" # @param ["End-to-end Whisper (default)", "Whisper -> DeepL", "No translation"]
# @markdown VAD, source separation, DeepL, and retry settings:
deepl_authkey = "" # @param {type:"string"}
source_separation = False # @param {type:"boolean"}
vad_threshold = 0.4 # @param {type:"number"}
chunk_threshold = 3.0 # @param {type:"number"}
deepl_target_lang = "EN-US" # @param {type:"string"}
max_attempts = 1 # @param {type:"integer"}
#@markdown Transcriber parameters. Leave unchanged if unsure.
verbose = False #@param {type:"boolean"}
temperature_input = "0.0" #@param {type:"string"}
compression_ratio_threshold = 2.4 #@param {type:"number"}
logprob_threshold = -1.0 #@param {type:"number"}
no_speech_threshold = 0.6 #@param {type:"number"}
condition_on_previous_text = False #@param {type:"boolean"}
initial_prompt = "" #@param {type:"string"}
word_timestamps = True #@param {type:"boolean"}
clip_timestamps_input = "0" #@param {type:"string"}
hallucination_silence_threshold = 2.0 #@param {type:"number"}
#@markdown Decoding options (advanced; leave unchanged if unsure):
best_of = 2 #@param {type:"number"}
beam_size = 2 #@param {type:"number"}
patience = 1 #@param {type:"number"}
length_penalty = "" #@param {type:"string"}
prefix = "" #@param {type:"string"}
suppress_tokens = "-1" #@param {type:"string"}
suppress_blank = True #@param {type:"boolean"}
without_timestamps = False #@param {type:"boolean"}
max_initial_timestamp = 1.0 #@param {type:"number"}
fp16 = True #@param {type:"boolean"}
# Parsing and converting form inputs
try:
    # Accept a single value ("0.0") or a comma-separated schedule ("0.0,0.2,0.4").
    temperature = (tuple(float(t.strip()) for t in temperature_input.split(','))
                   if ',' in temperature_input else float(temperature_input))
except ValueError:
    temperature = (0.0, 0.2, 0.4, 0.6, 0.8, 1.0)  # Whisper's default fallback schedule
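# For reference: a single temperature runs one decoding pass, while a tuple such as
# (0.0, 0.2, 0.4) is treated by openai-whisper as fallback temperatures, retried in order
# whenever a segment fails the compression-ratio or log-probability checks configured above.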
# whisper.transcribe() expects clip_timestamps as the string "0" or a list of floats,
# so a single non-default value is parsed into a one-element list rather than a bare float.
if clip_timestamps_input.strip() == "0":
    clip_timestamps = "0"
else:
    try:
        clip_timestamps = [float(ts.strip()) for ts in clip_timestamps_input.split(',')]
    except ValueError:
        clip_timestamps = "0"  # Default if parsing fails
language = None if not language else language  # empty string -> None (Whisper auto-detects)
initial_prompt = None if initial_prompt == "" else initial_prompt
length_penalty = None if length_penalty == "" else float(length_penalty)
assert max_attempts >= 1
assert vad_threshold >= 0.01
assert chunk_threshold >= 0.1
if translation_mode == "End-to-end Whisper (default)":
    task = "translate"
    run_deepl = False
elif translation_mode == "Whisper -> DeepL":
    task = "transcribe"
    run_deepl = True
elif translation_mode == "No translation":
    task = "transcribe"
    run_deepl = False
else:
    raise ValueError("Invalid translation mode")
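# Sketch of how the DeepL step can be wired up with the official `deepl` client. The
# per-segment translation itself is expected to happen in the code further down;
# `deepl_translator` and `segment_text` are illustrative names, not part of that code.
if run_deepl:
    import deepl  # also imported below
    assert deepl_authkey != "", "Whisper -> DeepL selected, but deepl_authkey is empty"
    deepl_translator = deepl.Translator(deepl_authkey)
    # Example of the call made per subtitle line:
    #   deepl_translator.translate_text(segment_text, target_lang=deepl_target_lang).text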
# Prepare transcription options
transcription_options = {
    "verbose": verbose,
    "compression_ratio_threshold": compression_ratio_threshold,
    "logprob_threshold": logprob_threshold,
    "no_speech_threshold": no_speech_threshold,
    "condition_on_previous_text": condition_on_previous_text,
    "initial_prompt": initial_prompt,
    "word_timestamps": word_timestamps,
    "clip_timestamps": clip_timestamps,
    "hallucination_silence_threshold": hallucination_silence_threshold,
}
# Prepare decoding options
decoding_options = {
    "task": task,
    "language": language,
    "temperature": temperature,
    "best_of": best_of,
    "beam_size": beam_size,
    "patience": patience,
    "length_penalty": length_penalty,
    "prefix": prefix,
    "suppress_tokens": suppress_tokens,
    "suppress_blank": suppress_blank,
    "without_timestamps": without_timestamps,
    "max_initial_timestamp": max_initial_timestamp,
    "fp16": fp16,
}
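# These two dicts mirror openai-whisper's split: transcription_options maps onto
# transcribe()'s own keyword arguments, while every key in decoding_options
# corresponds to a field of whisper.DecodingOptions.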
#@markdown **Run Whisper**
# @markdown Required settings:
audio_path = "/content/drive/MyDrive/test.wav" # @param {type:"string"}
assert audio_path != ""
import tensorflow as tf
import torch
import whisper
import os
import ffmpeg
import srt
from tqdm import tqdm
import datetime
import deepl
import urllib.request
import json
from google.colab import files
if "http://" in audio_path or "https://" in audio_path:
print("Downloading audio...")
urllib.request.urlretrieve(audio_path, "input_file")
audio_path = "input_file"
*rest of the code here*