@joshuacrowley
Created September 9, 2024 14:39
export const speakText = async (text: string, apiKey: string): Promise<void> => {
  let ws: WebSocket | null = null;
  let audioContext: AudioContext | null = null;
  let mediaStreamDestination: MediaStreamAudioDestinationNode | null = null;
  let scriptProcessorNode: ScriptProcessorNode | null = null;
  let audioElement: HTMLAudioElement | null = null;
  let buffer: Float32Array = new Float32Array();

  const API_VERSION = "2024-06-10";
  const SAMPLE_RATE = 44100;
  try {
    // Initialize the AudioContext and route audio through a MediaStream destination
    audioContext = new (window.AudioContext ||
      (window as any).webkitAudioContext)();
    mediaStreamDestination = audioContext.createMediaStreamDestination();

    // ScriptProcessorNode is deprecated in favour of AudioWorklet, but is kept
    // here for simplicity; it drains the shared buffer into the output stream.
    scriptProcessorNode = audioContext.createScriptProcessor(4096, 1, 1);
    scriptProcessorNode.onaudioprocess = (audioProcessingEvent) => {
      const outputBuffer = audioProcessingEvent.outputBuffer.getChannelData(0);
      if (buffer.length >= outputBuffer.length) {
        // Enough buffered audio: copy one block and advance the buffer
        outputBuffer.set(buffer.subarray(0, outputBuffer.length));
        buffer = buffer.subarray(outputBuffer.length);
      } else {
        // Not enough data yet: play what we have and pad the rest with silence
        outputBuffer.set(buffer);
        outputBuffer.fill(0, buffer.length);
        buffer = new Float32Array();
      }
    };
    scriptProcessorNode.connect(mediaStreamDestination);

    // Set up an audio element that plays the generated MediaStream
    audioElement = new Audio();
    audioElement.srcObject = mediaStreamDestination.stream;
    await audioElement.play();

    // Connect to the Cartesia TTS WebSocket endpoint
    const wsUrl = `wss://api.cartesia.ai/tts/websocket?api_key=${apiKey}&cartesia_version=${API_VERSION}`;
    ws = new WebSocket(wsUrl);
    await new Promise<void>((resolve, reject) => {
      if (!ws) return reject(new Error("WebSocket is null"));
      ws.onopen = () => {
        console.log("WebSocket connected");
        resolve();
      };
      ws.onerror = (error) => {
        console.error("WebSocket error:", error);
        reject(new Error("WebSocket connection failed"));
      };
      ws.onmessage = (event) => {
        const response = JSON.parse(event.data);
        if (response.type === "chunk" && response.data) {
          playAudioChunk(response.data);
        }
      };
    });
    // Send the text sentence by sentence on the same context so the voice
    // stays continuous across chunks (continue: true until the last one)
    const chunks = text.split(/(?<=[.!?])\s+/);
    for (let i = 0; i < chunks.length; i++) {
      const chunk = chunks[i];
      const isLast = i === chunks.length - 1;
      const request: TTSRequest = {
        context_id: "happy-monkeys-fly",
        model_id: "sonic-english",
        transcript: i === 0 ? chunk + " " : chunk,
        duration: 180,
        voice: {
          mode: "id",
          id: "87748186-23bb-4158-a1eb-332911b0b708",
        },
        output_format: {
          container: "raw",
          encoding: "pcm_s16le",
          sample_rate: SAMPLE_RATE,
        },
        language: "en",
        add_timestamps: false,
        continue: !isLast,
      };
      ws.send(JSON.stringify(request));
    }

    // Wait for audio to finish playing. Note: "ended" may never fire for a
    // MediaStream source, so a more robust version would resolve once the
    // server reports completion and the local buffer has drained.
    await new Promise<void>((resolve) => {
      if (audioElement) {
        audioElement.onended = () => resolve();
      }
    });
  } catch (error) {
    console.error("Error in text-to-speech:", error);
    throw error;
  } finally {
    // Clean up resources
    if (ws) ws.close();
    if (scriptProcessorNode) scriptProcessorNode.disconnect();
    if (audioElement) {
      audioElement.pause();
      audioElement.srcObject = null;
    }
    if (audioContext) await audioContext.close();
  }
  // Decode a base64 PCM chunk into 16-bit samples, convert to floats,
  // resample to the AudioContext rate, and append to the playback buffer
  function playAudioChunk(base64Data: string) {
    const arrayBuffer = Uint8Array.from(atob(base64Data), (c) =>
      c.charCodeAt(0)
    ).buffer;
    const int16Array = new Int16Array(arrayBuffer);
    const floatArray = new Float32Array(int16Array.length);
    for (let i = 0; i < int16Array.length; i++) {
      floatArray[i] = int16Array[i] / 32768.0;
    }
    const resampledBuffer = resampleBuffer(
      floatArray,
      SAMPLE_RATE,
      audioContext!.sampleRate
    );
    buffer = concatenateBuffers(buffer, resampledBuffer);
  }

  // Simple linear-interpolation resampler from inputSampleRate to outputSampleRate
  function resampleBuffer(
    inputBuffer: Float32Array,
    inputSampleRate: number,
    outputSampleRate: number
  ) {
    const ratio = outputSampleRate / inputSampleRate;
    const outputLength = Math.ceil(inputBuffer.length * ratio);
    const outputBuffer = new Float32Array(outputLength);
    for (let i = 0; i < outputLength; i++) {
      const inputIndex = i / ratio;
      const inputIndexFloor = Math.floor(inputIndex);
      const inputIndexCeil = Math.ceil(inputIndex);
      const fraction = inputIndex - inputIndexFloor;
      if (inputIndexCeil < inputBuffer.length) {
        outputBuffer[i] =
          (1 - fraction) * inputBuffer[inputIndexFloor] +
          fraction * inputBuffer[inputIndexCeil];
      } else {
        outputBuffer[i] = inputBuffer[inputIndexFloor];
      }
    }
    return outputBuffer;
  }

  // Append buffer2 to buffer1 in a newly allocated Float32Array
  function concatenateBuffers(buffer1: Float32Array, buffer2: Float32Array) {
    const result = new Float32Array(buffer1.length + buffer2.length);
    result.set(buffer1, 0);
    result.set(buffer2, buffer1.length);
    return result;
  }
};
interface TTSRequest {
  context_id: string;
  model_id: string;
  transcript: string;
  duration: number;
  voice: {
    mode: string;
    id: string;
  };
  output_format: {
    container: string;
    encoding: string;
    sample_rate: number;
  };
  language: string;
  add_timestamps: boolean;
  continue: boolean;
}
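
A minimal usage sketch, assuming speakText above is available in browser code and is called from a user gesture (autoplay policies block audio otherwise). The CARTESIA_API_KEY constant and the #speak-button element are hypothetical placeholders, not part of the gist.

// Hypothetical placeholder: load the key from your own configuration;
// never ship a real API key in client-side code.
const CARTESIA_API_KEY = "your-cartesia-api-key";

const speakButton = document.querySelector<HTMLButtonElement>("#speak-button");
speakButton?.addEventListener("click", async () => {
  try {
    // A click counts as the user gesture browsers require before playback
    await speakText(
      "Hello! This sentence is streamed from Cartesia and plays as it arrives.",
      CARTESIA_API_KEY
    );
  } catch (err) {
    console.error("speakText failed:", err);
  }
});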