Add support for captions/subtitle generation via transformers.js #4231

Draft · wants to merge 4 commits into `main`
3 changes: 3 additions & 0 deletions packages/transformers-js/.eslintrc
@@ -0,0 +1,3 @@
{
"extends": "@jonny"
}
18 changes: 18 additions & 0 deletions packages/transformers-js/README.md
@@ -0,0 +1,18 @@
# @remotion/transformers-js

Generate captions for Remotion videos using Whisper via transformers.js

[![NPM Downloads](https://img.shields.io/npm/dm/@remotion/transformers-js.svg?style=flat&color=black&label=Downloads)](https://npmcharts.com/compare/@remotion/transformers-js?minimal=true)

## Installation

```bash
npm install @remotion/transformers-js --save-exact
```

When installing a Remotion package, make sure to align the version of all `remotion` and `@remotion/*` packages to the same version.
Remove the `^` character from the version number to use the exact version.

## Usage

See the [documentation](https://www.remotion.dev/docs/transformers-js) for more information.
40 changes: 40 additions & 0 deletions packages/transformers-js/package.json
@@ -0,0 +1,40 @@
{
"repository": {
"url": "https://github.com/remotion-dev/remotion/tree/main/packages/transformers-js"
},
"name": "@remotion/transformers-js",
"version": "4.0.202",
"description": "Generate Captions for Remotion Videos using Whisper via transformers.js",
"main": "dist/index.js",
"sideEffects": false,
"bugs": {
"url": "https://github.com/remotion-dev/remotion/issues"
},
"scripts": {
"formatting": "prettier src --check",
"lint": "eslint src --ext ts,tsx",
"test": "bun test src"
},
"files": [
"dist"
],
"author": "David Rechkemmer",
"license": "SEE LICENSE IN LICENSE.md",
"dependencies": {
"@xenova/transformers": "^2.17.2"
},
"peerDependencies": {
"react": ">=16.8.0",
"react-dom": ">=16.8.0"
},
"keywords": [
"remotion",
"openai",
"whisper",
"transformers.js"
],
"publishConfig": {
"access": "public"
},
"homepage": "https://www.remotion.dev/docs/transformers-js"
}
36 changes: 36 additions & 0 deletions packages/transformers-js/src/decode.ts
@@ -0,0 +1,36 @@
/**
* Decode audio data from a file, blob, or array buffer
* @param data - File | Blob | ArrayBuffer
* @returns raw audio data
* @throws Error if decoding fails
*/
export const decodeAudioData = async (data: File | Blob | ArrayBuffer) => {
const audioContext = new window.AudioContext({
sampleRate: 16_000,
});

try {
if (data instanceof Blob || data instanceof File) {
data = await data.arrayBuffer();
}

const audioBuffer = await audioContext.decodeAudioData(data);
let audio: Float32Array;
if (audioBuffer.numberOfChannels === 2) {
// Merge stereo channels to mono with equal-power scaling
const SCALING_FACTOR = Math.sqrt(2);
const left = audioBuffer.getChannelData(0);
const right = audioBuffer.getChannelData(1);
audio = new Float32Array(left.length);
for (let i = 0; i < audioBuffer.length; ++i) {
audio[i] = (SCALING_FACTOR * (left[i] + right[i])) / 2;
}
} else {
audio = audioBuffer.getChannelData(0);
}

return audio;
} catch (e) {
throw new Error(`Failed to decode audio: ${e}`);
}
};
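For context, a minimal sketch of how `decodeAudioData` could be called from application code; the import path, the `loadSamples` helper, and the fetch URL are assumptions for illustration, not part of this PR:

```ts
import {decodeAudioData} from './decode';

// Hypothetical helper: fetch an audio asset and decode it to 16 kHz mono PCM.
const loadSamples = async (url: string): Promise<Float32Array> => {
  const response = await fetch(url);
  if (!response.ok) {
    throw new Error(`Failed to fetch ${url}: ${response.status}`);
  }

  // decodeAudioData accepts a File, Blob, or ArrayBuffer.
  return decodeAudioData(await response.arrayBuffer());
};

// Usage from an async context, e.g.:
// const audio = await loadSamples('/voiceover.mp3');
```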
Empty file.
15 changes: 15 additions & 0 deletions packages/transformers-js/src/util.ts
@@ -0,0 +1,15 @@

export async function hasWebGPU() {
// @ts-expect-error - navigator.gpu is not yet part of the default TypeScript DOM types
if (!navigator.gpu) {
return false;
}

try {
// @ts-expect-error - navigator.gpu is not yet part of the default TypeScript DOM types
const adapter = await navigator.gpu.requestAdapter();
return Boolean(adapter);
} catch (e) {
return false;
}
}
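As a sketch of how `hasWebGPU` could drive backend selection for the `PER_DEVICE_CONFIG` map in `worker.ts` below (the `pickDevice` helper is an assumption, not part of this PR):

```ts
import {hasWebGPU} from './util';

// Hypothetical helper: choose the device key that the worker understands,
// falling back to the WASM backend when no WebGPU adapter is available.
export const pickDevice = async (): Promise<'webgpu' | 'wasm'> => {
  return (await hasWebGPU()) ? 'webgpu' : 'wasm';
};
```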
137 changes: 137 additions & 0 deletions packages/transformers-js/src/worker.ts
@@ -0,0 +1,137 @@
import type {
AutomaticSpeechRecognitionOutput,
AutomaticSpeechRecognitionPipeline,
} from '@xenova/transformers';
import { pipeline } from '@xenova/transformers';

export interface DeviceConfig {
dtype: string | {[key: string]: string};
device: string;
}

export interface MessageData {
device: string;
audio: Float32Array;
language: string;
}

export interface ClientMessage {
type: string;
data: string | MessageData;
}

export interface WorkerResponse {
status: string;
data?:
| string
| {
result:
| AutomaticSpeechRecognitionOutput
| AutomaticSpeechRecognitionOutput[];
time: number;
};
}

const PER_DEVICE_CONFIG: {[key: string]: DeviceConfig} = {
webgpu: {
dtype: {
encoder_model: 'fp32',
decoder_model_merged: 'q4',
},
device: 'webgpu',
},
wasm: {
dtype: 'q8',
device: 'wasm',
},
};

/**
* This class uses the Singleton pattern to ensure that only one instance of the model is loaded.
*/
class PipelineSingleton {
static model_id = 'onnx-community/whisper-base_timestamped';
static instance: AutomaticSpeechRecognitionPipeline;

static async getInstance(
progress_callback: Function | undefined = undefined,
device = 'webgpu',
) {
if (!this.instance) {
this.instance = await pipeline(
'automatic-speech-recognition',
this.model_id,
{
...PER_DEVICE_CONFIG[device],
progress_callback,
},
);
}

return this.instance;
}
}

async function load({ device }: MessageData) {
self.postMessage({
status: 'loading',
data: `Loading model (${device})...`,
});

// Load the pipeline and save it for future use.
const transcriber = await PipelineSingleton.getInstance((x: unknown) => {
// We also add a progress callback to the pipeline so that we can
// track model loading.
self.postMessage(x);
}, device);

if (device === 'webgpu') {
self.postMessage({
status: 'loading',
data: 'Compiling shaders and warming up model...',
});
await transcriber(new Float32Array(16_000), {
language: 'en',
});
}

self.postMessage({
status: 'ready',
data: 'Model loaded successfully!'
});
}

async function run({audio, language}: MessageData) {
const transcriber = await PipelineSingleton.getInstance();

const start = performance.now();

const result = await transcriber(audio, {
language,
return_timestamps: 'word',
chunk_length_s: 30,
});

const end = performance.now();

self.postMessage({status: 'complete', data: {result, time: end - start}});
}


// Listen for messages from the main thread
self.addEventListener('message', async (e: any) => {
const {type, data} = e.data as ClientMessage;

switch (type) {
case 'load':
load(data as MessageData);
break;

case 'run':
run(data as MessageData);
break;

default:
self.postMessage({status: 'error', data: 'Unknown message type'});
}
});
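To illustrate the message protocol above, here is a hedged sketch of how a main-thread caller might drive this worker, combining `decodeAudioData` and `hasWebGPU`. The worker URL, the `transcribeFile` helper, and the bundler-style `import.meta.url` resolution are assumptions, not part of this PR:

```ts
import {decodeAudioData} from './decode';
import {hasWebGPU} from './util';

// Assumed worker instantiation; the actual path and bundling may differ.
const worker = new Worker(new URL('./worker.ts', import.meta.url), {
  type: 'module',
});

// Resolve once the worker reports that the model has been loaded.
const waitForReady = () =>
  new Promise<void>((resolve) => {
    const onMessage = (e: MessageEvent) => {
      if (e.data.status === 'ready') {
        worker.removeEventListener('message', onMessage);
        resolve();
      }
    };
    worker.addEventListener('message', onMessage);
  });

// Hypothetical helper: load the model once, then transcribe a File or Blob.
export const transcribeFile = async (file: File | Blob, language = 'en') => {
  const device = (await hasWebGPU()) ? 'webgpu' : 'wasm';
  const ready = waitForReady();
  worker.postMessage({type: 'load', data: {device}});
  await ready;

  const audio = await decodeAudioData(file);

  return new Promise((resolve, reject) => {
    const onMessage = (e: MessageEvent) => {
      const {status, data} = e.data;
      if (status === 'complete') {
        worker.removeEventListener('message', onMessage);
        resolve(data); // {result, time}
      } else if (status === 'error') {
        worker.removeEventListener('message', onMessage);
        reject(new Error(String(data)));
      }
    };
    worker.addEventListener('message', onMessage);
    worker.postMessage({type: 'run', data: {audio, language}});
  });
};
```

In a real integration the load step would typically happen once ahead of time rather than on every call, so that repeated transcriptions reuse the already-initialized pipeline.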
8 changes: 8 additions & 0 deletions packages/transformers-js/tsconfig.json
@@ -0,0 +1,8 @@
{
"extends": "../tsconfig.settings.json",
"compilerOptions": {
"rootDir": "src",
"outDir": "dist"
},
"include": ["src"]
}