Add support for captions/subtitle generation via transformers.js #4231

Draft · wants to merge 4 commits into `main`
3 changes: 3 additions & 0 deletions packages/transformers-js/.eslintrc
@@ -0,0 +1,3 @@
{
"extends": "@jonny"
}
18 changes: 18 additions & 0 deletions packages/transformers-js/README.md
@@ -0,0 +1,18 @@
# @remotion/transformers-js

Generate captions for Remotion videos using Whisper via transformers.js

[![NPM Downloads](https://img.shields.io/npm/dm/@remotion/transformers-js.svg?style=flat&color=black&label=Downloads)](https://npmcharts.com/compare/@remotion/transformers-js?minimal=true)

## Installation

```bash
npm install @remotion/transformers-js --save-exact
```

When installing a Remotion package, make sure to align the version of all `remotion` and `@remotion/*` packages to the same version.
Remove the `^` character from the version number to use the exact version.

## Usage

See the [documentation](https://www.remotion.dev/docs/transformers-js) for more information.
40 changes: 40 additions & 0 deletions packages/transformers-js/package.json
@@ -0,0 +1,40 @@
{
"repository": {
"url": "https://github.com/remotion-dev/remotion/tree/main/packages/transformers-js"
},
"name": "@remotion/transformers-js",
"version": "4.0.202",
"description": "Generate Captions for Remotion Videos using Whisper via transformers.js",
"main": "dist/index.js",
"sideEffects": false,
"bugs": {
"url": "https://github.com/remotion-dev/remotion/issues"
},
"scripts": {
"formatting": "prettier src --check",
"lint": "eslint src --ext ts,tsx",
"test": "bun test src"
},
"files": [
"dist"
],
"author": "David Rechkemmer",
"license": "SEE LICENSE IN LICENSE.md",
"dependencies": {
"@xenova/transformers": "^2.17.2"
},
"peerDependencies": {
"react": ">=16.8.0",
"react-dom": ">=16.8.0"
},
"keywords": [
"remotion",
"openai",
"whisper",
"transformers.js"
],
"publishConfig": {
"access": "public"
},
"homepage": "https://www.remotion.dev/docs/transformers-js"
}
36 changes: 36 additions & 0 deletions packages/transformers-js/src/decode.ts
@@ -0,0 +1,36 @@
/**
* Decode audio data from a file, blob, or array buffer
* @param data - File | Blob | ArrayBuffer
* @returns raw audio data
* @throws Error if decoding fails
*/
export const decodeAudioData = async (data: File | Blob | ArrayBuffer) => {
const audioContext = new window.AudioContext({
sampleRate: 16_000,
});

try {
if (data instanceof Blob || data instanceof File) {
data = await data.arrayBuffer();
}

const audioBuffer = await audioContext.decodeAudioData(data);
let audio: Float32Array;
if (audioBuffer.numberOfChannels === 2) {
// Merge stereo channels to mono with equal-power scaling
const SCALING_FACTOR = Math.sqrt(2);
const left = audioBuffer.getChannelData(0);
const right = audioBuffer.getChannelData(1);
audio = new Float32Array(left.length);
for (let i = 0; i < audioBuffer.length; ++i) {
audio[i] = (SCALING_FACTOR * (left[i] + right[i])) / 2;
}
} else {
audio = audioBuffer.getChannelData(0);
}

return audio;
} catch (e) {
throw new Error(`Failed to decode audio: ${e}`);
}
};
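For context, a minimal sketch of how `decodeAudioData` could be called from application code; the import path, the `loadSamples` helper, and the fetch URL are assumptions for illustration, not part of this PR:

```ts
import {decodeAudioData} from './decode';

// Hypothetical helper: fetch an audio asset and decode it to 16 kHz mono PCM.
const loadSamples = async (url: string): Promise<Float32Array> => {
  const response = await fetch(url);
  if (!response.ok) {
    throw new Error(`Failed to fetch ${url}: ${response.status}`);
  }

  // decodeAudioData accepts a File, Blob, or ArrayBuffer.
  return decodeAudioData(await response.arrayBuffer());
};

// Usage from an async context, e.g.:
// const audio = await loadSamples('/voiceover.mp3');
```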
Empty file.
15 changes: 15 additions & 0 deletions packages/transformers-js/src/util.ts
@@ -0,0 +1,15 @@

export async function hasWebGPU() {
// @ts-expect-error - navigator.gpu is not yet part of the default TypeScript DOM types
if (!navigator.gpu) {
return false;
}

try {
// @ts-expect-error - navigator.gpu is not yet part of the default TypeScript DOM types
const adapter = await navigator.gpu.requestAdapter();
return Boolean(adapter);
} catch (e) {
return false;
}
}
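As a sketch of how `hasWebGPU` could drive backend selection for the `PER_DEVICE_CONFIG` map in `worker.ts` below (the `pickDevice` helper is an assumption, not part of this PR):

```ts
import {hasWebGPU} from './util';

// Hypothetical helper: choose the device key that the worker understands,
// falling back to the WASM backend when no WebGPU adapter is available.
export const pickDevice = async (): Promise<'webgpu' | 'wasm'> => {
  return (await hasWebGPU()) ? 'webgpu' : 'wasm';
};
```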
137 changes: 137 additions & 0 deletions packages/transformers-js/src/worker.ts
@@ -0,0 +1,137 @@
import type {
AutomaticSpeechRecognitionOutput,
AutomaticSpeechRecognitionPipeline,
} from '@xenova/transformers';
import { pipeline } from '@xenova/transformers';

export interface DeviceConfig {
dtype: string | {[key: string]: string};
device: string;
}

export interface MessageData {
device: string;
audio: Float32Array;
language: string;
}

export interface ClientMessage {
type: string;
data: string | MessageData;
}

export interface WorkerResponse {
status: string;
data?:
| string
| {
result:
| AutomaticSpeechRecognitionOutput
| AutomaticSpeechRecognitionOutput[];
time: number;
};
}

const PER_DEVICE_CONFIG: {[key: string]: DeviceConfig} = {
webgpu: {
dtype: {
encoder_model: 'fp32',
decoder_model_merged: 'q4',
},
device: 'webgpu',
},
wasm: {
dtype: 'q8',
device: 'wasm',
},
};

/**
* This class uses the Singleton pattern to ensure that only one instance of the model is loaded.
*/
class PipelineSingleton {
static model_id = 'onnx-community/whisper-base_timestamped';
static instance: AutomaticSpeechRecognitionPipeline;

static async getInstance(
progress_callback: Function | undefined = undefined,
device = 'webgpu',
) {
if (!this.instance) {
this.instance = await pipeline(
'automatic-speech-recognition',
this.model_id,
{
...PER_DEVICE_CONFIG[device],
progress_callback,
},
);
}

return this.instance;
}
}

async function load({ device }: MessageData) {
self.postMessage({
status: 'loading',
data: `Loading model (${device})...`,
});

// Load the pipeline and save it for future use.
const transcriber = await PipelineSingleton.getInstance((x: unknown) => {
// We also add a progress callback to the pipeline so that we can
// track model loading.
self.postMessage(x);
}, device);

if (device === 'webgpu') {
self.postMessage({
status: 'loading',
data: 'Compiling shaders and warming up model...',
});
await transcriber(new Float32Array(16_000), {
language: 'en',
});
}

self.postMessage({
status: 'ready',
data: 'Model loaded successfully!'
});
}

async function run({audio, language}: MessageData) {
const transcriber = await PipelineSingleton.getInstance();

const start = performance.now();

const result = await transcriber(audio, {
language,
return_timestamps: 'word',
chunk_length_s: 30,
});

const end = performance.now();

self.postMessage({status: 'complete', data: {result, time: end - start}});
}


// Listen for messages from the main thread
self.addEventListener('message', async (e: any) => {
const {type, data} = e.data as ClientMessage;

switch (type) {
case 'load':
load(data as MessageData);
break;

case 'run':
run(data as MessageData);
break;

default:
self.postMessage({status: 'error', data: 'Unknown message type'});
}
});
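To illustrate the message protocol above, here is a hedged sketch of how a main-thread caller might drive this worker, combining `decodeAudioData` and `hasWebGPU`. The worker URL, the `transcribeFile` helper, and the bundler-style `import.meta.url` resolution are assumptions, not part of this PR:

```ts
import {decodeAudioData} from './decode';
import {hasWebGPU} from './util';

// Assumed worker instantiation; the actual path and bundling may differ.
const worker = new Worker(new URL('./worker.ts', import.meta.url), {
  type: 'module',
});

// Resolve once the worker reports that the model has been loaded.
const waitForReady = () =>
  new Promise<void>((resolve) => {
    const onMessage = (e: MessageEvent) => {
      if (e.data.status === 'ready') {
        worker.removeEventListener('message', onMessage);
        resolve();
      }
    };
    worker.addEventListener('message', onMessage);
  });

// Hypothetical helper: load the model once, then transcribe a File or Blob.
export const transcribeFile = async (file: File | Blob, language = 'en') => {
  const device = (await hasWebGPU()) ? 'webgpu' : 'wasm';
  const ready = waitForReady();
  worker.postMessage({type: 'load', data: {device}});
  await ready;

  const audio = await decodeAudioData(file);

  return new Promise((resolve, reject) => {
    const onMessage = (e: MessageEvent) => {
      const {status, data} = e.data;
      if (status === 'complete') {
        worker.removeEventListener('message', onMessage);
        resolve(data); // {result, time}
      } else if (status === 'error') {
        worker.removeEventListener('message', onMessage);
        reject(new Error(String(data)));
      }
    };
    worker.addEventListener('message', onMessage);
    worker.postMessage({type: 'run', data: {audio, language}});
  });
};
```

In a real integration the load step would typically happen once ahead of time rather than on every call, so that repeated transcriptions reuse the already-initialized pipeline.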
8 changes: 8 additions & 0 deletions packages/transformers-js/tsconfig.json
@@ -0,0 +1,8 @@
{
"extends": "../tsconfig.settings.json",
"compilerOptions": {
"rootDir": "src",
"outDir": "dist"
},
"include": ["src"]
}