diff --git a/index.d.ts b/index.d.ts
new file mode 100644
index 0000000..d995b1d
--- /dev/null
+++ b/index.d.ts
@@ -0,0 +1,136 @@
+/// <reference types="node" />
+import { Transform, TransformOptions } from 'node:stream';
+
+declare class VAD {
+    /**
+     * Creates a new Voice Activity Detection instance
+     * @param mode VAD sensitivity mode. Defaults to Mode.NORMAL if omitted.
+     */
+    constructor(mode?: VAD.Mode);
+
+    /**
+     * Analyse the given samples and notify the detected voice event via promise.
+     * Note: Sample buffers should be rather short (36ms to 144ms) and the sample rate
+     * no higher than 32kHz for best results. Sample rates higher than 16kHz provide
+     * no benefit to the VAD algorithm, as human voice patterns center around 4000 to 6000Hz.
+     * @param samples Buffer object containing 16bit signed values
+     * @param sampleRate Audio sample rate.
+     * @returns Promise resolving to a VAD.Event code
+     */
+    processAudio(samples: Buffer, sampleRate: VAD.SampleRate): Promise<VAD.Event>;
+
+    /**
+     * Analyse the given samples and notify the detected voice event via promise.
+     * Note: Sample buffers should be rather short (36ms to 144ms) and the sample rate
+     * no higher than 32kHz for best results. Sample rates higher than 16kHz provide
+     * no benefit to the VAD algorithm, as human voice patterns center around 4000 to 6000Hz.
+     * @param samples Buffer object containing 32bit normalized float values
+     * @param sampleRate Audio sample rate.
+     * @returns Promise resolving to a VAD.Event code
+     */
+    processAudioFloat(samples: Buffer, sampleRate: VAD.SampleRate): Promise<VAD.Event>;
+
+    /**
+     * Creates a new VAD stream for continuous audio processing
+     * @param opts Stream options
+     */
+    static createStream(opts?: VAD.StreamOptions): VAD.VADStream;
+
+    /**
+     * Converts a 16-bit signed audio buffer to float format
+     * @param buffer Input buffer
+     * @returns Float buffer
+     */
+    static toFloatBuffer(buffer: Buffer): Buffer;
+}
+
+declare namespace VAD {
+    /** Valid sample rates for audio processing (in Hz). 16000Hz recommended for best performance/accuracy tradeoff. */
+    export type SampleRate = 8000 | 16000 | 32000 | 48000;
+
+    export enum Event {
+        /** Constant for voice detection errors */
+        ERROR = -1,
+        /** Constant for voice detection results with no detected voices */
+        SILENCE = 0,
+        /** Constant for voice detection results with detected voice */
+        VOICE = 1,
+        /** Constant for voice detection results with detected noise (Not implemented yet) */
+        NOISE = 2
+    }
+
+    export enum Mode {
+        /**
+         * Normal voice detection mode. Suitable for high bitrate, low-noise data.
+         * May classify noise as voice, too. The default value if mode is omitted in the constructor.
+         */
+        NORMAL = 0,
+        /** Detection mode optimised for low-bitrate audio */
+        LOW_BITRATE = 1,
+        /** Detection mode best suited for somewhat noisy, lower quality audio */
+        AGGRESSIVE = 2,
+        /** Detection mode with lowest miss-rate. Works well for most inputs */
+        VERY_AGGRESSIVE = 3
+    }
+
+    export interface StreamOptions {
+        /**
+         * VAD sensitivity mode
+         * @default Mode.NORMAL
+         */
+        mode?: Mode;
+
+        /**
+         * Audio sample rate in Hz
+         * @default 16000
+         */
+        audioFrequency?: SampleRate;
+
+        /**
+         * Time in milliseconds to wait before marking the end of speech
+         * @default 1000
+         */
+        debounceTime?: number;
+    }
+
+    export interface SpeechData {
+        /** Current state of speech */
+        state: boolean;
+        /** True on chunk when speech starts */
+        start: boolean;
+        /** True on chunk when speech ends */
+        end: boolean;
+        /** Time when speech started */
+        startTime: number;
+        /** Duration of current speech block */
+        duration: number;
+    }
+
+    export interface StreamOutput {
+        /** Current seek time in audio */
+        time: number;
+        /** Original audio data */
+        audioData: Buffer;
+        /** Speech detection information */
+        speech: SpeechData;
+    }
+
+    export class VADStream extends Transform {
+        constructor(options?: StreamOptions & TransformOptions);
+
+        /** Current VAD instance */
+        vad: VAD;
+        /** Audio sample rate */
+        audioFrequency: number;
+        /** Debounce time in milliseconds */
+        debounceTime: number;
+        /** Current speech state */
+        state: boolean;
+        /** Time when speech started */
+        startTime: number;
+        /** Time of last detected speech */
+        lastSpeech: number;
+    }
+}
+
+export = VAD;
diff --git a/package.json b/package.json
index ec074f4..f4cc132 100644
--- a/package.json
+++ b/package.json
@@ -12,6 +12,7 @@
     "release-major": "npm version major && npm publish"
   },
   "main": "./index.js",
+  "types": "./index.d.ts",
   "license": "MIT",
   "engines": {
     "node": ">=6.14.3",