
Tencent ASR over WebSocket

February 19, 2024
柏拉文
The harder you work, the luckier you get

1. Overview


Tencent ASR speech recognition uses the WebSocket protocol to recognize a real-time audio stream and return results as the audio arrives, so text appears while the speaker is still talking.

WebSocket: reconnecting-websocket

Tencent ASR documentation

1.1 Audio Recording

See the Audio Recording chapter.

1.2 Audio Resampling

See the Audio Resampling chapter.

1.3 Speech Recognition

The Tencent ASR workflow is as follows:

  1. Handshake: the frontend fills in part of the request URL parameters and sends them to the server, which generates the request URL carrying the signature and prepends the wss:// prefix; the frontend then opens a WebSocket connection with this URL, and the server replies with a text message whose content is a JSON-serialized string

  2. Recognition: send PCM audio data sampled at 16 kHz and receive recognition results; each result is returned as a text message whose content is a JSON-serialized string

  3. Result handling: after the JSON-serialized string is parsed with JSON.parse(), it contains fields such as code, message, voice_id, final and result (a minimal handling sketch follows this list)

    • code: return code; 0 means success, any other value means failure

    • result: the latest recognition result

      • slice_type: 0 - a sentence starts, 1 - sentence recognition in progress, 2 - a sentence ends

      • voice_text_str: text of the current sentence, UTF-8 encoded

      • start_time: start time of the current sentence within the whole audio stream

      • end_time: end time of the current sentence within the whole audio stream

      • voice_id: unique identifier of the audio stream

      • word_size: number of word results in the current sentence

      • word_list: word list of the current sentence

    • final: when this field is 1, the whole audio stream has finished being recognized
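
To make step 3 concrete, here is a minimal sketch of handling one such text message. The sample payload and the handleAsrMessage helper are invented for illustration; the field names follow the list above.

// Hypothetical example of a single text message received during recognition.
const rawMessage = JSON.stringify({
  code: 0,
  message: 'success',
  voice_id: 'f2c37564-1213-43f5-b84a-2ed8de3f7484',
  final: 0,
  result: {
    slice_type: 1, // 0: sentence starts, 1: in progress, 2: sentence ends
    voice_text_str: '今天天气不错',
    start_time: 0,
    end_time: 1200,
    word_size: 0,
    word_list: []
  }
});

function handleAsrMessage(raw) {
  const response = JSON.parse(raw);
  if (response.code !== 0) {
    console.log('ASR error:', response.message);
    return;
  }
  if (response.final === 1) {
    console.log('Audio stream fully recognized, voice_id:', response.voice_id);
    return;
  }
  if (response.result) {
    const { slice_type, voice_text_str } = response.result;
    const label = slice_type === 2 ? 'sentence end' : 'partial result';
    console.log(`${label}:`, voice_text_str);
  }
}

handleAsrMessage(rawMessage);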

2. Implementation


2.1 index.html

<div class="operation">
<button id="start-record">录制</button>
<button id="stop-record">停止</button>
</div>

<div class="asr-result-container"></div>
<div class="audio-container">
<audio id="audio" controls></audio>
</div>

<script type="module" src="./index.js"></script>

2.2 index.js

import Processor from './processor/index.js';
import Recognizer from './recognizer/index.js';

let recognizer = null;
let recordStatus = 'init';
let mediaRecorder = null;
const recordedBlobs = [];
const audio = document.getElementById('audio');
const startRecordEl = document.getElementById('start-record');
const stopRecordEl = document.getElementById('stop-record');
const asrResultContainerEl = document.querySelector('.asr-result-container');

function startRecord() {
  recordStatus = 'recording';
  const timeSlice = 5000;
  mediaRecorder.start(timeSlice);

  // A pre-signed example URL; normally it would come from getTencentASRUrl() in recognizer/url.js.
  recognizer.start(
    'wss://asr.cloud.tencent.com/asr/v2/1303248253?convert_num_mode=1&engine_model_type=16k_zh&expired=1708686737&filter_dirty=1&filter_modal=2&filter_punc=0&hotword_id=08003a00000000000000000000000000&needvad=1&nonce=17086831&secretid=AKIDdCU1KGl1nKXquwnI8j7H4dw0pulN2KRg&t=1708683136899&timestamp=1708683137&vad_silence_time=800&voice_format=1&voice_id=f2c37564-1213-43f5-b84a-2ed8de3f7484&word_info=2&signature=LyHqzm9TWuxU9DcK6cKGxxApVHM%3D'
  );
}

async function stopRecord() {
  recordStatus = 'stopped';
  mediaRecorder.stop();
  const audioBlob = new Blob(recordedBlobs);
  const audioUrl = URL.createObjectURL(audioBlob);
  audio.src = audioUrl;
}

function processRecord(data) {
  if (recordStatus !== 'recording') {
    return;
  }

  const { buffer } = data;
  recognizer.send(buffer);
}

function recognitionResultChange(data) {
  asrResultContainerEl.innerHTML = data;
}

async function prepareRecord() {
  const stream = await navigator.mediaDevices.getUserMedia({
    audio: true
  });

  recognizer = new Recognizer({
    onRecognitionResultChange: recognitionResultChange
  });

  new Processor({ stream, processRecord });
  mediaRecorder = new MediaRecorder(stream);

  mediaRecorder.ondataavailable = event => {
    if (event.data && event.data.size > 0) {
      recordedBlobs.push(event.data);
    }
  };
}

async function checkPermissions(name) {
  try {
    return await navigator.permissions.query({ name: name });
  } catch (error) {
    return false;
  }
}

async function run() {
  const microphone = await checkPermissions('microphone');
  if (microphone && microphone.state === 'granted') {
    await prepareRecord();
    startRecordEl.addEventListener('click', startRecord);
    stopRecordEl.addEventListener('click', stopRecord);
  } else {
    alert('Please allow microphone access');
  }
}

run();

2.3 processor/index.js

function isSupportAudioWorklet(audioContext) {
  return (
    audioContext.audioWorklet &&
    typeof audioContext.audioWorklet.addModule === 'function' &&
    typeof AudioWorkletNode !== 'undefined'
  );
}

function isSupportCreateScriptProcessor(audioContext) {
  return typeof audioContext.createScriptProcessor === 'function';
}

// Downsample the captured audio to 16 kHz using linear interpolation.
function to16kHz(audioData, sampleRate = 44100) {
  const data = new Float32Array(audioData);
  const fitCount = Math.round(data.length * (16000 / sampleRate));
  const newData = new Float32Array(fitCount);
  const springFactor = (data.length - 1) / (fitCount - 1);
  newData[0] = data[0];
  for (let i = 1; i < fitCount - 1; i++) {
    const tmp = i * springFactor;
    const before = Math.floor(tmp);
    const after = Math.ceil(tmp);
    const atPoint = tmp - before;
    newData[i] = data[before] + (data[after] - data[before]) * atPoint;
  }
  newData[fitCount - 1] = data[data.length - 1];
  return newData;
}

// Convert 32-bit float samples in [-1, 1] to 16-bit little-endian PCM.
function to16BitPCM(input) {
  const dataLength = input.length * (16 / 8);
  const dataBuffer = new ArrayBuffer(dataLength);
  const dataView = new DataView(dataBuffer);
  let offset = 0;
  for (let i = 0; i < input.length; i++, offset += 2) {
    const s = Math.max(-1, Math.min(1, input[i]));
    dataView.setInt16(offset, s < 0 ? s * 0x8000 : s * 0x7fff, true);
  }
  return dataView;
}

export default class Processor {
  constructor(options) {
    const { stream } = options;

    this.options = options;
    this.audioContext = new AudioContext();
    this.mediaStreamSource = this.audioContext.createMediaStreamSource(stream);

    this.init();
  }

  init() {
    // Prefer AudioWorklet; fall back to the deprecated ScriptProcessorNode.
    if (isSupportAudioWorklet(this.audioContext)) {
      this.audioWorkletNodeDealAudioData();
    } else {
      this.scriptNodeDealAudioData();
    }
  }

  scriptNodeDealAudioData() {
    if (!isSupportCreateScriptProcessor(this.audioContext)) {
      return;
    }

    try {
      const scriptProcessor = this.audioContext.createScriptProcessor(1024, 1, 1);
      this.mediaStreamSource.connect(scriptProcessor);
      scriptProcessor.connect(this.audioContext.destination);

      scriptProcessor.onaudioprocess = event => {
        const samples = event.inputBuffer.getChannelData(0);
        // Pass the context's real sample rate (often 48 kHz) rather than relying on the 44.1 kHz default.
        const output = to16kHz(samples, this.audioContext.sampleRate);
        const audioBuffer = to16BitPCM(output);

        const data = {
          buffer: audioBuffer
        };

        this.options.processRecord?.(data);
      };
    } catch (e) {
      console.log('scriptNodeDealAudioData error:', e);
    }
  }

  async audioWorkletNodeDealAudioData() {
    if (!isSupportAudioWorklet(this.audioContext)) {
      return;
    }

    try {
      await this.audioContext.audioWorklet.addModule(
        'http://127.0.0.1:5502/test/javascript/audioRecord/022301/processor/custom-processor.js'
      );

      const customNode = new AudioWorkletNode(
        this.audioContext,
        'custom-processor'
      );

      this.mediaStreamSource
        .connect(customNode)
        .connect(this.audioContext.destination);

      customNode.port.onmessage = event => {
        const { audioBuffer } = event.data;
        const data = {
          buffer: audioBuffer
        };

        this.options.processRecord?.(data);
      };
    } catch (e) {
      console.log('audioWorkletNodeDealAudioData error:', e);
    }
  }
}

2.4 processor/custom-processor.js

// Same resampling and PCM-conversion helpers as in processor/index.js.
function to16kHz(audioData, sampleRate = 44100) {
  const data = new Float32Array(audioData);
  const fitCount = Math.round(data.length * (16000 / sampleRate));
  const newData = new Float32Array(fitCount);
  const springFactor = (data.length - 1) / (fitCount - 1);
  newData[0] = data[0];
  for (let i = 1; i < fitCount - 1; i++) {
    const tmp = i * springFactor;
    const before = Math.floor(tmp);
    const after = Math.ceil(tmp);
    const atPoint = tmp - before;
    newData[i] = data[before] + (data[after] - data[before]) * atPoint;
  }
  newData[fitCount - 1] = data[data.length - 1];
  return newData;
}

function to16BitPCM(input) {
  const dataLength = input.length * (16 / 8);
  const dataBuffer = new ArrayBuffer(dataLength);
  const dataView = new DataView(dataBuffer);
  let offset = 0;
  for (let i = 0; i < input.length; i++, offset += 2) {
    const s = Math.max(-1, Math.min(1, input[i]));
    dataView.setInt16(offset, s < 0 ? s * 0x8000 : s * 0x7fff, true);
  }
  return dataView;
}

class CustomProcessor extends AudioWorkletProcessor {
  constructor(options) {
    super(options);
  }

  process(inputs) {
    const input = inputs[0];
    if (!input || input.length === 0) {
      // Return true so the processor stays alive while no input is connected yet.
      return true;
    }

    const samples = input[0];
    // `sampleRate` is a global in AudioWorkletGlobalScope and reflects the context's real rate.
    const output = to16kHz(samples, sampleRate);
    const audioBuffer = to16BitPCM(output);
    this.port.postMessage({ audioBuffer });

    return true;
  }
}

registerProcessor('custom-processor', CustomProcessor);

2.5 recognizer/index.js

export default class Recognizer {
  constructor(options) {
    this.socket = null;
    this.options = options;

    // Whether the handshake (signature check) has succeeded
    this.isSignSuccess = false;
    // Whether a sentence has started
    this.isSentenceBegin = false;
    // Whether the whole recognition has finished
    this.isRecognizeComplete = false;
  }

  async start(url) {
    this.socket = new WebSocket(url);

    this.socket.onmessage = e => {
      const response = JSON.parse(e.data);

      if (response.code !== 0) {
        console.log('Recognizer start error: ', response.message);
        this.socket.close();
        return;
      }

      if (!this.isSignSuccess) {
        this.onRecognitionStart(response);
        this.isSignSuccess = true;
      }

      if (response.final === 1) {
        this.isRecognizeComplete = true;
        this.onRecognitionComplete(response);
        return;
      }

      if (response.result) {
        const result = {
          ...response.result,
          voice_id: response.voice_id
        };

        if (response.result.slice_type === 0) {
          this.onRecognitionSentenceBegin(result);
          this.isSentenceBegin = true;
        } else if (response.result.slice_type === 2) {
          if (!this.isSentenceBegin) {
            this.onRecognitionSentenceBegin(result);
          }
          this.onRecognitionSentenceEnd(result);
        } else {
          this.onRecognitionResultChange(result);
        }
      }
    };

    this.socket.onerror = e => {
      this.socket.close();
      this.onError(e);
    };

    this.socket.onclose = event => {
      if (!this.isRecognizeComplete) {
        this.onError(event);
      }
    };
  }

  send(data) {
    if (!this.socket || this.socket.readyState !== 1) {
      return;
    }
    this.socket.send(data);
  }

  // Recognition failed
  onError(error) {
    console.log('onError error', error);
  }

  // Called when recognition starts (handshake succeeded)
  onRecognitionStart(result) {
    console.log('onRecognitionStart result', result);
  }

  // Called when a sentence begins
  onRecognitionSentenceBegin(result) {
    console.log('onRecognitionSentenceBegin res', result);
  }

  // Called when the recognition result changes
  onRecognitionResultChange(result) {
    const { voice_text_str } = result;
    this.options.onRecognitionResultChange?.(voice_text_str);
  }

  // Called when a sentence ends
  onRecognitionSentenceEnd(res) {
    console.log('onRecognitionSentenceEnd res', res);
  }

  // Called when the whole recognition finishes
  onRecognitionComplete(response) {
    console.log('onRecognitionComplete response', response);
  }
}

2.6 recognizer/url.js

// Generate an RFC 4122 v4 style UUID, used as voice_id.
function guid() {
  return 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx'.replace(/[xy]/g, (c) => {
    const r = (Math.random() * 16) | 0;
    const v = c === 'x' ? r : (r & 0x3) | 0x8;
    return v.toString(16);
  });
}

// Ask Tencent for its server time so the signature timestamp does not drift.
function getServerTime() {
  return new Promise((resolve, reject) => {
    try {
      const xhr = new XMLHttpRequest();
      xhr.open('GET', 'https://asr.cloud.tencent.com/server_time', true);
      xhr.send();
      xhr.onreadystatechange = () => {
        if (xhr.readyState === 4 && xhr.status === 200) {
          resolve(xhr.responseText);
        }
      };
      xhr.onerror = () => {
        reject('get tx server time error');
      };
    } catch (error) {
      reject('get tx server time error');
    }
  });
}

async function createTencentASRAuthParams() {
  const time = new Date().getTime();
  const serverTime = await getServerTime();

  const params = {
    engine_model_type: '16k_zh', // Engine model type; 16k_zh is the general-purpose Mandarin model
    timestamp: parseInt(serverTime, 10) || Math.round(time / 1000), // Current UNIX timestamp in seconds; a large drift from the real time causes a signature-expired error
    expired: Math.round(time / 1000) + 1 * 60 * 60, // UNIX timestamp (seconds) at which the signature expires; must be greater than timestamp, and expired - timestamp must be less than 90 days
    nonce: Math.round(time / 100000), // Random positive integer chosen by the caller, at most 10 digits
    voice_id: guid(), // Globally unique identifier of the audio stream, one per WebSocket connection, generated by the caller (a UUID is recommended), at most 128 characters
    voice_format: 1, // Audio encoding, optional, default 4. 1: pcm; 4: speex(sp); 6: silk; 8: mp3; 10: opus (see the opus stream packaging notes); 12: wav; 14: m4a (each slice must be a complete m4a file); 16: aac
    hotword_id: '08003a00000000000000000000000000', // Hotword list id. If unset, the default hotword list is used; if set, the specified list takes effect
    needvad: 1, // 0: disable VAD, 1: enable VAD, default 0. If an audio slice is longer than 60 seconds, VAD (voice activity detection) must be enabled
    vad_silence_time: 0.8 * 1000, // Sentence-break threshold in ms: silence longer than this is treated as a break (mostly for contact-center scenarios, requires needvad = 1). Range 240-2000, default 1000; do not change it casually, as it may affect accuracy; currently only the 8k_zh, 8k_zh_finance and 16k_zh models support it
    filter_dirty: 1, // Whether to filter profanity (Mandarin engines only). Default 0. 0: keep; 1: filter; 2: replace with " * "
    filter_modal: 2, // Whether to filter filler words (Mandarin engines only). Default 0. 0: keep; 1: partial filtering; 2: strict filtering
    filter_punc: 0, // Whether to strip the sentence-final full stop (Mandarin engines only). Default 0. 0: keep it; 1: strip it
    convert_num_mode: 1, // Smart Arabic-numeral conversion (Mandarin engines only). 0: output Chinese numerals as-is; 1: convert to Arabic numerals where the context calls for it; 3: also convert math-related numbers. Default 1
    word_info: 2, // Whether to return word-level timestamps. 0: no; 1: yes, without punctuation timestamps; 2: yes, including punctuation timestamps. Supported by 8k_en, 8k_zh, 8k_zh_finance, 16k_zh, 16k_en, 16k_ca, 16k_zh-TW, 16k_ja, 16k_wuu-SH. Default 0
  };

  return params;
}

/**
 * @description: Get the request URL carrying the Tencent ASR signature (normally implemented on the server)
 * Docs: https://cloud.tencent.com/document/product/1093/48982
 * 1. Sort all parameters except signature in lexicographical order and concatenate them into the request URL (without the wss:// prefix); this is the string to sign
 * 2. Sign the string with HmacSha1 using the SecretKey, then base64-encode the digest
 * 3. URL-encode the resulting signature
 * 4. Append the signature parameter to the request URL
 */
async function getTencentAsrAuthSignatureURL() {
  // ……
  // return result.data.url;
}

async function getTencentASRUrl() {
  const signatureParams = await createTencentASRAuthParams();
  const asrUrl = await getTencentAsrAuthSignatureURL(signatureParams);
  return asrUrl; // The format is: wss://asr.cloud.tencent.com/asr/v2/1303248253?convert_num_mode=1&engine_model_type=16k_zh&expired=1708686737&filter_dirty=1&filter_modal=2&filter_punc=0&hotword_id=08003a00000000000000000000000000&needvad=1&nonce=17086831&secretid=xxxx&t=1708683136899&timestamp=1708683137&vad_silence_time=800&voice_format=1&voice_id=f2c37564-1213-43f5-b84a-2ed8de3f7484&word_info=2&signature=xxx
}
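
The elided getTencentAsrAuthSignatureURL above is expected to call a backend endpoint, since the SecretKey must never reach the browser. As a rough sketch of what such a backend could do (APP_ID is taken from the sample URL above; SECRET_ID, SECRET_KEY and signTencentAsrUrl are placeholder names, and the exact signing rules should be checked against the official documentation), the four steps from the comment might look like this in Node.js:

import crypto from 'crypto';

// Placeholder credentials; in practice these stay on the server.
const APP_ID = '1303248253';
const SECRET_ID = 'your-secret-id';
const SECRET_KEY = 'your-secret-key';

function signTencentAsrUrl(params) {
  const allParams = { ...params, secretid: SECRET_ID, t: Date.now() };

  // 1. Sort every parameter except signature lexicographically and build the
  //    unsigned request URL (without the wss:// prefix); this is the string to sign.
  const query = Object.keys(allParams)
    .sort()
    .map(key => `${key}=${allParams[key]}`)
    .join('&');
  const stringToSign = `asr.cloud.tencent.com/asr/v2/${APP_ID}?${query}`;

  // 2. HMAC-SHA1 over the string to sign with the SecretKey, then base64-encode the digest.
  const signature = crypto
    .createHmac('sha1', SECRET_KEY)
    .update(stringToSign)
    .digest('base64');

  // 3 + 4. URL-encode the signature and append it to the request URL.
  return `wss://${stringToSign}&signature=${encodeURIComponent(signature)}`;
}

With this split, the frontend only ever receives the fully signed wss:// URL returned by the backend, which is exactly what Recognizer.start() expects.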