Skip to content

Instantly share code, notes, and snippets.

@swombat
Created August 11, 2024 07:42
Show Gist options
  • Save swombat/d8b7eb339854aa2804b0c12b437e201a to your computer and use it in GitHub Desktop.
Save swombat/d8b7eb339854aa2804b0c12b437e201a to your computer and use it in GitHub Desktop.
Transcribe via whisper-1
#!/usr/bin/env ruby
require 'open3'
require 'signal'
require 'openai'
# Tuning for ffmpeg's silencedetect filter (interpolated into the filter
# string in start_ffmpeg as d=<duration> and n=<threshold>).
@silence_duration = 0.5
@silence_threshold = '-30dB'
# Wraps an OpenAI client and transcribes audio files with the "whisper-1"
# model, retrying failed requests with exponential backoff.
class AudioTranscriberApi
  # How many times a failed request is retried before the error is re-raised.
  MAX_RETRIES = 5

  # @param access_token [String] API key passed to OpenAI::Client
  def initialize(access_token:)
    @access_token = access_token
    @client = OpenAI::Client.new(
      access_token: @access_token,
      request_timeout: 20
    )
  end

  # Sends +audio_file+ to the transcription endpoint and returns the value
  # stored under "text" in the response.
  #
  # The file is opened in block form on every attempt so the handle is closed
  # whether the request succeeds or raises; previously each attempt (and each
  # retry) left an open file descriptor behind.
  #
  # @param audio_file [String] path of the audio file to upload
  # @return [Object] the "text" entry of the client response
  # @raise [StandardError] the last error once MAX_RETRIES retries have failed
  def transcribe(audio_file)
    retries = 0
    begin
      File.open(audio_file, "rb") do |file|
        response = @client.audio.transcribe(
          parameters: { model: "whisper-1", file: file, language: "en" }
        )
        response["text"]
      end
    rescue StandardError => e
      puts "Error transcribing audio file: #{e.message}"
      # Out of retries: re-raise the error currently being handled.
      raise if retries >= MAX_RETRIES

      retries += 1
      puts "Retrying audio transcription for #{audio_file} (#{retries} times) after exponential backoff (#{2 ** retries} seconds)"
      sleep 2 ** retries
      retry
    end
  end
end
# Builds the WAV file name for a recording segment: the index is rendered as
# a string and left-padded with zeros to at least three characters.
#
# @param file_index [#to_s] segment number
# @return [String] e.g. "output_007.wav"
def filename(file_index)
  padded_index = file_index.to_s.rjust(3, '0')
  "output_#{padded_index}.wav"
end
# Starts one recording segment: a sox process capturing audio and an ffmpeg
# process that receives sox's output on stdin, runs the silencedetect filter
# and writes the audio to filename(file_index).
#
# @param file_index [Integer] segment number used to derive the output file
# @return [Array] sox stdin, ffmpeg stdout, ffmpeg stderr, the sox wait
#   thread and the ffmpeg wait thread, in that order
def start_ffmpeg(file_index)
  file_name = filename(file_index)
  sox_cmd = [
    'sox',
    '-t', 'coreaudio',
    '-d', # Use default audio input device
    '-c', '2', # Stereo channels
    '-r', '44100', # Sample rate
    '-b', '16', # Bit depth
    '-e', 'signed-integer',
    '-t', 'wav', # Output as WAV format for piping
    '-'
  ]
  ffmpeg_cmd = [
    'ffmpeg',
    '-f', 'wav',
    '-i', '-', # Input from sox via pipe
    '-af', "silencedetect=n=#{@silence_threshold}:d=#{@silence_duration}",
    '-c:a', 'pcm_s16le',
    '-y', # Overwrite output files
    file_name
  ]

  sox_stdin, sox_stdout, sox_stderr, sox_wait_thr = Open3.popen3(*sox_cmd)
  ffmpeg_stdin, ffmpeg_stdout, ffmpeg_stderr, ffmpeg_wait_thr = Open3.popen3(*ffmpeg_cmd)

  # The stream is binary audio, so copy raw bytes. A gets/puts loop treats it
  # as text lines and appends a newline to a final chunk that lacks one.
  sox_stdout.binmode
  ffmpeg_stdin.binmode
  Thread.new do
    begin
      IO.copy_stream(sox_stdout, ffmpeg_stdin)
    rescue Errno::EPIPE, IOError
      # ffmpeg went away (or a pipe was closed) mid-copy; nothing left to forward.
    ensure
      # Closing ffmpeg's stdin signals end of input; also releases both pipe ends.
      [sox_stdout, ffmpeg_stdin].each do |io|
        begin
          io.close unless io.closed?
        rescue IOError, SystemCallError
          # Best effort: the pipe is already unusable.
        end
      end
    end
  end

  # Nothing else reads sox's stderr; discard it so sox cannot block on a
  # full pipe, and close our end once sox is done with it.
  Thread.new do
    begin
      IO.copy_stream(sox_stderr, File::NULL)
    rescue IOError, SystemCallError
      # Stream closed underneath us; nothing to drain.
    ensure
      sox_stderr.close unless sox_stderr.closed?
    end
  end

  # Same five values, in the same order, as before.
  [sox_stdin, ffmpeg_stdout, ffmpeg_stderr, sox_wait_thr, ffmpeg_wait_thr]
end
# Transcribes one recorded segment on a background thread: resolves the
# segment's file name, sends it through AudioTranscriberApi (API key read
# from the OPENAI_API_KEY environment variable), prints the result and then
# deletes the file.
#
# @param file_index [Integer] segment number of the recording to transcribe
# @return [Thread] the thread doing the work
def transcribe_audio(file_index)
  Thread.new do
    recording = filename(file_index)
    api = AudioTranscriberApi.new(access_token: ENV['OPENAI_API_KEY'])
    puts api.transcribe(recording)
    File.delete(recording)
  end
end
# Sends SIGTERM to the process behind +wait_thr+ if it is still running.
# The process may exit between the alive? check and the kill, in which case
# Process.kill raises Errno::ESRCH; that is treated as "already stopped".
def terminate_recorder_process(wait_thr)
  Process.kill("TERM", wait_thr.pid) if wait_thr.alive?
rescue Errno::ESRCH
  nil
end

# Monitor for silence detection in stderr.
#
# Watches the current ffmpeg process's stderr on a background thread. On the
# first qualifying "silence_start" line it stops the current sox/ffmpeg pair,
# starts the next segment (with its own monitor), waits for the old ffmpeg to
# exit and then hands the finished segment to transcribe_audio.
#
# @return [Thread] the monitoring thread
def monitor_for_silence
  # Snapshot the handles of the segment this monitor is responsible for; the
  # instance variables are reassigned once the next segment starts.
  sox_stdin = @stdin
  ffmpeg_stdout = @stdout
  ffmpeg_stderr = @stderr
  sox_thr = @sox_wait_thr
  ffmpeg_thr = @ffmpeg_wait_thr
  finished_index = @file_index
  exit_grace_seconds = 5

  Thread.new do
    # NOTE(review): the second check is a substring match, so it also skips
    # timestamps such as "silence_start: 0.734" — confirm that is intended.
    silence_found = ffmpeg_stderr.each_line.any? do |line|
      line.include?("silence_start") && !line.include?("silence_start: 0")
    end
    # stderr reached EOF without a qualifying line: nothing to hand over.
    next unless silence_found

    # Gracefully interrupt the current sox and ffmpeg processes
    terminate_recorder_process(sox_thr)
    terminate_recorder_process(ffmpeg_thr)

    # Start a new recording right away so capture resumes while the old
    # segment is still being finalised.
    @file_index = finished_index + 1
    @stdin, @stdout, @stderr, @sox_wait_thr, @ffmpeg_wait_thr = start_ffmpeg(@file_index)
    monitor_for_silence

    # Give the old processes time to exit before the file is read, so ffmpeg
    # is no longer writing it when transcription (and deletion) begins. The
    # timeout keeps a stuck process from blocking this thread forever.
    ffmpeg_thr.join(exit_grace_seconds)
    sox_thr.join(exit_grace_seconds)

    # Release the finished segment's pipes; otherwise every segment leaks them.
    [sox_stdin, ffmpeg_stdout, ffmpeg_stderr].each do |io|
      begin
        io.close unless io.closed?
      rescue IOError, SystemCallError
        # Best effort: the pipe is already unusable.
      end
    end

    transcribe_audio(finished_index)
  end
end
# Start the initial recording: segment numbering begins at 1, and the five
# values returned by start_ffmpeg are kept in instance variables that
# monitor_for_silence reads.
@file_index = 1
@stdin, @stdout, @stderr, @sox_wait_thr, @ffmpeg_wait_thr = start_ffmpeg(@file_index)
# Spawns the background thread that watches ffmpeg's stderr.
monitor_for_silence
# Keep the main thread alive to handle signals; sleep with no argument
# blocks indefinitely while the background threads do the work.
sleep
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment