Last active
May 30, 2024 08:11
-
-
Save diatche/6186513d058521b6fd2a13705263516e to your computer and use it in GitHub Desktop.
Audio Transcriber using OpenAI Whisper
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import argparse | |
import subprocess | |
import os | |
import math | |
from openai import OpenAI | |
# Whisper's upload limit; anything larger gets split into chunks.
MAX_SIZE = 26214400  # Maximum file size (in bytes)

# Command-line interface: one positional argument naming the input file.
arg_parser = argparse.ArgumentParser(
    description="Transcribe an audio file using OpenAI Whisper."
)
arg_parser.add_argument("file_path", type=str, help="Path to the audio or video file")
args = arg_parser.parse_args()

# The API key must come from the environment; fail fast when it is absent.
print("Getting API key from environment...")
key = os.getenv("OPENAI_API_KEY")
if key is None:
    raise ValueError("Environment variable OPENAI_API_KEY is not set")
print("API key obtained.")

# Shared OpenAI client used by every transcription request below.
client = OpenAI(api_key=key)
def transcribe_file(file_path):
    """Transcribe an audio (or video) file with OpenAI Whisper.

    Video input is first converted to an audio-only MP3. Files larger than
    MAX_SIZE are split into overlapping MP3 chunks with ffmpeg, transcribed
    sequentially, and the chunk transcripts are joined with blank lines.

    Args:
        file_path: Path to the audio or video file.

    Returns:
        The transcript text.

    Raises:
        OSError: If the input file is unreadable.
        subprocess.CalledProcessError: If an ffmpeg chunk encode fails.
    """
    # Video files are converted to audio-only MP3 before transcription.
    if is_video_file(file_path):
        file_path = handle_video_file(file_path)

    # Decide how many chunks are needed to stay under the upload limit.
    file_size = os.path.getsize(file_path)
    num_chunks = math.ceil(file_size / MAX_SIZE)

    if num_chunks > 1:
        file_duration = get_audio_duration(file_path)
        chunk_duration = (
            math.ceil(file_duration / num_chunks) + 10
        )  # Add 10 seconds for overlap
        transcripts = []
        chunk_files = []  # Paths of temporary chunk files, for cleanup
        try:
            # Split the file into chunks and transcribe them sequentially.
            for i in range(num_chunks):
                # Each chunk starts a few seconds before the previous one's
                # effective end so speech at the boundary is not lost.
                start_time = max(0, i * (chunk_duration - 10) - 5)
                chunk_file_path = f"{file_path}_chunk{i}.mp3"
                # Remove any stale chunk from an earlier failed run: ffmpeg
                # runs with "-n" (never overwrite), so an existing file would
                # make it fail and the old chunk would be transcribed instead.
                if os.path.exists(chunk_file_path):
                    os.remove(chunk_file_path)
                chunk_files.append(chunk_file_path)
                # Create the chunk with ffmpeg; check=True surfaces encoder
                # failures instead of silently transcribing a missing file.
                subprocess.run(
                    [
                        "ffmpeg",
                        "-i",
                        file_path,
                        "-ss",
                        str(start_time),
                        "-t",
                        str(chunk_duration),
                        "-vn",
                        "-acodec",
                        "libmp3lame",
                        "-n",
                        chunk_file_path,
                    ],
                    check=True,
                )
                print(f"Transcribing chunk {i+1}/{num_chunks}...")
                with open(chunk_file_path, "rb") as audio_file:
                    # Transcribe the chunk
                    response = client.audio.transcriptions.create(
                        model="whisper-1", file=audio_file
                    )
                # Extract the transcription text from the response
                transcript = response.text
                transcripts.append(transcript)
                print(f"Chunk {i+1} transcribed.")
                print(transcript)
            # Combine the chunk transcripts into one document.
            transcript = "\n\n".join(transcripts)
        finally:
            # Always clean up temporary chunk files, even on failure.
            for chunk_file in chunk_files:
                if os.path.exists(chunk_file):
                    os.remove(chunk_file)
    else:
        # Small enough to upload in a single request.
        print("Transcribing...")
        with open(file_path, "rb") as audio_file:
            # Transcribe the whole file at once
            response = client.audio.transcriptions.create(
                model="whisper-1", file=audio_file
            )
        # Extract the transcription text from the response
        transcript = response.text
        print("Transcribed.")
    return transcript
def get_audio_duration(file_path):
    """Return the duration of an audio file, in seconds, as reported by ffprobe.

    Raises:
        OSError: If the file cannot be opened for reading.
        ValueError: If ffprobe produces no duration output.
    """
    if not is_readable(file_path):
        raise OSError("Unable to read from: " + file_path)
    probe_cmd = [
        "ffprobe",
        "-v",
        "error",
        "-show_entries",
        "format=duration",
        "-of",
        "default=noprint_wrappers=1:nokey=1",
        file_path,
    ]
    probe = subprocess.run(
        probe_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE
    )
    # An empty stdout means ffprobe could not determine a duration.
    if not probe.stdout:
        raise ValueError("Unable to get audio duration: " + file_path)
    return float(probe.stdout)
def handle_video_file(video_file_path):
    """Extract the audio track of a video file into an MP3.

    Args:
        video_file_path: Path to the source video.

    Returns:
        Path of the extracted MP3 (input path with an "_audio.mp3" suffix).

    Raises:
        OSError: If the video file is unreadable.
        subprocess.CalledProcessError: If ffmpeg fails to extract the audio.
    """
    if not is_readable(video_file_path):
        raise OSError("Unable to read from: " + video_file_path)
    print("Extracting audio from video...")
    audio_file_path = os.path.splitext(video_file_path)[0] + "_audio.mp3"
    # Remove any leftover output from a previous run: ffmpeg is invoked with
    # "-n" (never overwrite), so an existing file would make it fail and the
    # stale, possibly incomplete audio would be returned silently.
    if os.path.exists(audio_file_path):
        os.remove(audio_file_path)
    # check=True turns a failed extraction into an exception instead of
    # letting the caller proceed with a missing output file.
    subprocess.run(
        [
            "ffmpeg",
            "-i",
            video_file_path,
            "-vn",
            "-acodec",
            "libmp3lame",
            "-n",
            audio_file_path,
        ],
        check=True,
    )
    print("Audio extracted from video.")
    return audio_file_path
def is_video_file(file_path):
    """Return True if ffprobe finds a video stream in the file.

    Raises:
        OSError: If the file cannot be opened for reading.
    """
    if not is_readable(file_path):
        raise OSError("Unable to read from: " + file_path)
    # Ask ffprobe for the codec name of the first video stream only;
    # any output at all means such a stream exists.
    probe_cmd = [
        "ffprobe",
        "-v",
        "error",
        "-select_streams",
        "v:0",
        "-show_entries",
        "stream=codec_name",
        "-of",
        "default=noprint_wrappers=1:nokey=1",
        file_path,
    ]
    probe = subprocess.run(
        probe_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE
    )
    return bool(probe.stdout)
def is_readable(file_path):
    """Return True if the file can be opened for binary reading."""
    try:
        open(file_path, "rb").close()
    except IOError:
        return False
    return True
# Transcribe the requested file and echo the result to the console.
transcript = transcribe_file(args.file_path)
print("Transcript:")
print(transcript)

# Persist the transcript beside the input as "<input stem>_transcript.txt".
out_path = os.path.splitext(args.file_path)[0] + "_transcript.txt"
print("Saving transcript to file...")
with open(out_path, "w") as out_file:
    out_file.write(transcript)
print("Transcript saved to file.")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
This script can be run from the command line as follows:
This will transcribe the audio file and write the transcript to a file in the same directory as the input. The script will also print the transcript to the console. This depends on the
openai
library, and you will need to have
ffmpeg
installed.