Created
September 9, 2024 14:39
-
-
Save joshuacrowley/d4879dfe7e39edf86bc07bffb5749126 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * Streams `text` to the Cartesia text-to-speech WebSocket endpoint and plays
 * the returned raw PCM audio in the browser.
 *
 * The text is split into sentences and sent as one continued context. Audio
 * chunks are decoded, resampled to the AudioContext's rate, queued, and fed to
 * an `<audio>` element through a ScriptProcessorNode connected to a
 * MediaStream destination.
 *
 * The returned promise resolves once the server has signalled the end of the
 * stream (or the socket closed) and all queued audio has been handed to the
 * output. Empty or whitespace-only text resolves immediately without opening
 * any audio or network resources.
 *
 * NOTE(review): ScriptProcessorNode is deprecated in favour of AudioWorklet;
 * it is kept here because a worklet needs a separately loaded module.
 *
 * @param text - Text to speak.
 * @param apiKey - Cartesia API key, sent as a URL query parameter.
 * @returns A promise that settles when playback has finished or failed.
 * @throws Error if Web Audio is unavailable, playback cannot start, the
 *   WebSocket fails, or the server reports an error. Errors are logged and
 *   rethrown.
 */
export const speakText = async (text: string, apiKey: string): Promise<void> => {
  const API_VERSION = "2024-06-10";
  const SAMPLE_RATE = 44100;
  const PROCESSOR_BUFFER_SIZE = 4096;
  // Number of consecutive all-silent processor callbacks to wait for after the
  // stream has ended, so the tail of the audio can pass through the media
  // element before everything is torn down (heuristic).
  const DRAIN_CALLBACKS = 3;

  // Nothing to speak: avoid sending an empty transcript and waiting forever.
  const trimmedText = text.trim();
  if (trimmedText === "") return;

  let ws: WebSocket | null = null;
  let audioContext: AudioContext | null = null;
  let mediaStreamDestination: MediaStreamAudioDestinationNode | null = null;
  let scriptProcessorNode: ScriptProcessorNode | null = null;
  let audioElement: HTMLAudioElement | null = null;

  // Decoded audio waiting to be played, consumed front to back.
  const pendingChunks: Float32Array[] = [];
  // Read position inside pendingChunks[0].
  let headOffset = 0;
  // Trailing byte of a chunk whose payload had an odd length; it is the first
  // half of a 16-bit sample completed by the next chunk.
  let leftoverByte: number | null = null;
  // True once the server reported completion or the socket closed.
  let streamDone = false;
  let silentCallbacks = 0;
  let resolvePlayback: (() => void) | null = null;

  try {
    // Initialize AudioContext (with the prefixed constructor as a fallback).
    const browserWindow = window as Window & {
      webkitAudioContext?: typeof AudioContext;
    };
    const AudioContextCtor =
      window.AudioContext || browserWindow.webkitAudioContext;
    if (!AudioContextCtor) {
      throw new Error("Web Audio API is not supported in this environment");
    }
    const context = new AudioContextCtor();
    audioContext = context;
    const destination = context.createMediaStreamDestination();
    mediaStreamDestination = destination;
    const processor = context.createScriptProcessor(
      PROCESSOR_BUFFER_SIZE,
      1,
      1
    );
    scriptProcessorNode = processor;

    processor.onaudioprocess = (audioProcessingEvent) => {
      const output = audioProcessingEvent.outputBuffer.getChannelData(0);
      let written = 0;
      while (written < output.length) {
        const head = pendingChunks[0];
        if (!head) break;
        const take = Math.min(
          head.length - headOffset,
          output.length - written
        );
        output.set(head.subarray(headOffset, headOffset + take), written);
        written += take;
        headOffset += take;
        if (headOffset >= head.length) {
          pendingChunks.shift();
          headOffset = 0;
        }
      }
      // Pad with silence when the queue ran dry.
      output.fill(0, written);

      if (streamDone && written === 0) {
        silentCallbacks += 1;
        if (silentCallbacks >= DRAIN_CALLBACKS) resolvePlayback?.();
      } else {
        silentCallbacks = 0;
      }
    };
    processor.connect(destination);

    // Set up audio element
    const player = new Audio();
    audioElement = player;
    player.srcObject = destination.stream;
    await player.play();

    // Connect WebSocket
    const wsUrl = `wss://api.cartesia.ai/tts/websocket?api_key=${encodeURIComponent(
      apiKey
    )}&cartesia_version=${API_VERSION}`;
    const socket = new WebSocket(wsUrl);
    ws = socket;
    await new Promise<void>((resolve, reject) => {
      socket.onopen = () => {
        console.log("WebSocket connected");
        resolve();
      };
      socket.onerror = (error) => {
        console.error("WebSocket error:", error);
        reject(new Error("WebSocket connection failed"));
      };
    });

    // From here on, socket events drive the playback promise. The media
    // element's `ended` event cannot be used: it does not fire while the
    // destination node's stream track is still live.
    const playbackFinished = new Promise<void>((resolve, reject) => {
      resolvePlayback = resolve;
      socket.onmessage = (event: MessageEvent) => {
        if (typeof event.data !== "string") return;
        let parsed: unknown;
        try {
          parsed = JSON.parse(event.data);
        } catch (parseError) {
          console.error("Ignoring malformed WebSocket message:", parseError);
          return;
        }
        if (typeof parsed !== "object" || parsed === null) return;
        const message = parsed as {
          type?: unknown;
          data?: unknown;
          error?: unknown;
          done?: unknown;
        };
        if (message.type === "error") {
          const detail =
            typeof message.error === "string"
              ? message.error
              : JSON.stringify(message.error);
          reject(new Error(`Text-to-speech request failed: ${detail}`));
          return;
        }
        if (
          message.type === "chunk" &&
          typeof message.data === "string" &&
          message.data !== ""
        ) {
          try {
            playAudioChunk(message.data, context.sampleRate);
          } catch (decodeError) {
            reject(
              decodeError instanceof Error
                ? decodeError
                : new Error("Failed to decode audio chunk")
            );
            return;
          }
        }
        // NOTE(review): assumes the server marks the end of a context with
        // `type: "done"` / `done: true` — confirm against the Cartesia docs.
        if (message.type === "done" || message.done === true) {
          streamDone = true;
        }
      };
      socket.onerror = (error) => {
        console.error("WebSocket error:", error);
        reject(new Error("WebSocket connection failed"));
      };
      // A closed socket can deliver no more audio; let the queue drain.
      socket.onclose = () => {
        streamDone = true;
      };
    });

    // Send text in chunks
    const chunks = trimmedText.split(/(?<=[.!?])\s+/);
    for (let i = 0; i < chunks.length; i++) {
      const chunk = chunks[i] ?? "";
      const isLast = i === chunks.length - 1;
      const request: TTSRequest = {
        context_id: "happy-monkeys-fly",
        model_id: "sonic-english",
        // The split removed the whitespace between sentences; restore a
        // separator on every chunk that is followed by another one.
        transcript: isLast ? chunk : chunk + " ",
        duration: 180,
        voice: {
          mode: "id",
          id: "87748186-23bb-4158-a1eb-332911b0b708",
        },
        output_format: {
          container: "raw",
          encoding: "pcm_s16le",
          sample_rate: SAMPLE_RATE,
        },
        language: "en",
        add_timestamps: false,
        continue: !isLast,
      };
      socket.send(JSON.stringify(request));
    }

    // Wait for audio to finish playing
    await playbackFinished;
  } catch (error) {
    console.error("Error in text-to-speech:", error);
    throw error;
  } finally {
    // Clean up resources. Handlers are detached first so late events cannot
    // touch state that has already been torn down.
    if (ws) {
      ws.onopen = null;
      ws.onmessage = null;
      ws.onerror = null;
      ws.onclose = null;
      ws.close();
    }
    if (scriptProcessorNode) {
      scriptProcessorNode.onaudioprocess = null;
      scriptProcessorNode.disconnect();
    }
    if (audioElement) {
      audioElement.pause();
      audioElement.srcObject = null;
    }
    if (mediaStreamDestination) {
      mediaStreamDestination.stream.getTracks().forEach((track) => track.stop());
    }
    if (audioContext && audioContext.state !== "closed") {
      await audioContext.close();
    }
  }

  /**
   * Decodes one base64 chunk of 16-bit little-endian PCM, resamples it to
   * `targetSampleRate`, and appends it to the playback queue.
   */
  function playAudioChunk(base64Data: string, targetSampleRate: number): void {
    const binary = atob(base64Data);
    const carry = leftoverByte === null ? 0 : 1;
    const bytes = new Uint8Array(carry + binary.length);
    if (leftoverByte !== null) bytes[0] = leftoverByte;
    for (let i = 0; i < binary.length; i++) {
      bytes[carry + i] = binary.charCodeAt(i);
    }
    // Keep an unpaired trailing byte for the next chunk instead of failing.
    leftoverByte =
      bytes.length % 2 === 1 ? bytes[bytes.length - 1] ?? null : null;

    const sampleCount = bytes.length >> 1;
    if (sampleCount === 0) return;
    const view = new DataView(bytes.buffer);
    const samples = new Float32Array(sampleCount);
    for (let i = 0; i < sampleCount; i++) {
      samples[i] = view.getInt16(i * 2, true) / 32768.0;
    }
    pendingChunks.push(resampleBuffer(samples, SAMPLE_RATE, targetSampleRate));
  }

  /**
   * Linearly interpolates `inputBuffer` from `inputSampleRate` to
   * `outputSampleRate`. Returns the input unchanged when the rates match.
   */
  function resampleBuffer(
    inputBuffer: Float32Array,
    inputSampleRate: number,
    outputSampleRate: number
  ): Float32Array {
    if (inputSampleRate === outputSampleRate) return inputBuffer;
    const ratio = outputSampleRate / inputSampleRate;
    const outputLength = Math.ceil(inputBuffer.length * ratio);
    const outputBuffer = new Float32Array(outputLength);
    for (let i = 0; i < outputLength; i++) {
      const inputIndex = i / ratio;
      const inputIndexFloor = Math.floor(inputIndex);
      const fraction = inputIndex - inputIndexFloor;
      const lower = inputBuffer[inputIndexFloor] ?? 0;
      // Past the last input sample there is nothing to blend with: hold it.
      const upper = inputBuffer[Math.ceil(inputIndex)] ?? lower;
      outputBuffer[i] = (1 - fraction) * lower + fraction * upper;
    }
    return outputBuffer;
  }
};
/** Voice selector carried in the `voice` field of a TTS request. */
interface TTSVoice {
  mode: string;
  id: string;
}

/** Audio format description carried in the `output_format` field of a TTS request. */
interface TTSOutputFormat {
  container: string;
  encoding: string;
  sample_rate: number;
}

/**
 * Shape of the request object that `speakText` serializes to JSON and sends
 * over the text-to-speech WebSocket, one per transcript chunk.
 */
interface TTSRequest {
  /** Identifier shared by all chunks of one request sequence. */
  context_id: string;
  model_id: string;
  /** Text of this chunk. */
  transcript: string;
  // NOTE(review): unit of `duration` is not evident from this file — confirm.
  duration: number;
  voice: TTSVoice;
  output_format: TTSOutputFormat;
  language: string;
  add_timestamps: boolean;
  /** `speakText` sets this to true on every chunk except the last. */
  continue: boolean;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment