继续上文,硬件软件准备齐全,介绍一下主要用到的库
sherpa-onnx
开源项目,支持语音转文本、文本转语音、说话人分类和 VAD,关键是支持 C# 开发
OllamaSharp
用于连接 Ollama,如其名,使用 C# 开发
虽然离可玩还有一段距离,但是还是要说一下目前遇到的一些问题
1、最初使用流式语音识别,但录音流发送到 server 之后完全识别不到文字,保存为 pcm 播放后发现全是噪音。最后把 16 位采样转换为 float 并放大增益之后,才可以正确识别到说话内容
// Amplification factor multiplied into every normalized sample in Recognize.
static float gain = 5.0f;
// Incoming bytes reinterpreted as 16-bit samples; reassigned on each Recognize call.
short[] int16Array;
// Float samples passed to the recognizer stream; reassigned on each Recognize call.
float[] floatArray;
/// <summary>
/// Converts a chunk of raw 16-bit PCM bytes into amplified float samples and
/// feeds them to the streaming recognizer.
/// </summary>
/// <param name="bytes">
/// Raw audio bytes; every two bytes are read as one 16-bit sample. A trailing
/// odd byte is ignored, and a null or sub-sample payload is skipped entirely.
/// </param>
public void Recognize(byte[] bytes)
{
    // Nothing to feed when there is not even one whole sample.
    if (bytes == null || bytes.Length < 2)
    {
        return;
    }

    // Copy whole samples only: copying bytes.Length bytes into a short[] of
    // bytes.Length / 2 elements overruns the destination (and throws) when
    // the payload length is odd.
    int sampleCount = bytes.Length / 2;
    int16Array = new short[sampleCount];
    Buffer.BlockCopy(bytes, 0, int16Array, 0, sampleCount * sizeof(short));

    floatArray = new float[sampleCount];
    for (int i = 0; i < sampleCount; i++)
    {
        // Normalize by 32768, then apply the gain factor.
        floatArray[i] = int16Array[i] / 32768.0f * gain;
    }

    onlineStream.AcceptWaveform(sampleRate, floatArray);
}
2、流式识别有个问题:要等我说下一句话的时候,才会确定上一句已经结束。虽然有三个参数可调,但多次测试都没调好,只好改用离线识别:给esp32s3添加一个按键,按下时开始发送录音数据,松开时停止发送并告知server录音结束,server端再开始识别
// Default values
config.Rule1MinTrailingSilence = 2.4f;
config.Rule2MinTrailingSilence = 0.5f;
// Limit the longest utterance to 10 seconds
config.Rule3MinUtteranceLength = 10f;
void loop() {
webSocket.loop(); // 必须调用以处理WebSocket事件
if(digitalRead(PIN_BUTTON) == LOW)
{
pressed = true;
uint8_t buffer[BUFFER_SIZE];
size_t bytesRead;
// 从I2S读取音频数据
i2s_read(I2S_NUM_0, buffer, BUFFER_SIZE, &bytesRead, portMAX_DELAY);
// 通过WebSocket发送音频数据
if (webSocket.sendBIN(buffer, bytesRead)) {
//Serial.printf("Sent %d bytes of audio data\n", bytesRead);
} else {
//Serial.println("Failed to send audio data");
}
}
else
{
if(pressed)
{
pressed = false;
if (webSocket.sendTXT("{\"code\":1,\"message\":\"结束语音\"}")) {
} else {
}
}
}
}
/// <summary>
/// Handles a text message from a WebSocket connection. The message is parsed
/// as a <c>BaseMsg</c>; when its code is 1, the recognizer registered for this
/// connection is told that the recording has ended.
/// </summary>
/// <param name="connection">Connection the message arrived on.</param>
/// <param name="msg">JSON text of the message.</param>
private static void OnMessage(IWebSocketConnection connection, string msg)
{
    BaseMsg parsed;
    try
    {
        parsed = JsonConvert.DeserializeObject<BaseMsg>(msg);
    }
    catch (Exception ex)
    {
        // Unparseable payloads are logged and otherwise ignored.
        Console.WriteLine(ex);
        return;
    }

    // Only code 1 is handled here: stop recording and start recognition.
    if (parsed == null || parsed.code != 1)
    {
        return;
    }

    // Recognizers are looked up by the connection's hash code.
    if (asrs.TryGetValue(connection.GetHashCode(), out Asr session) && session != null)
    {
        session.EndReceive();
    }
}
3、周围人多声音嘈杂的时候,语音识别开始放飞自我,七八米开外的人声它都收进来…… 用RNNoise.Net处理了一下,也不知道对不对,感觉有点效果……
https://github.com/Yellow-Dog-Man/RNNoise.Net
// Incoming bytes reinterpreted as 16-bit samples; reassigned on each Recognize call.
short[] int16Array;
// Normalized float samples that are denoised and then decoded; reassigned on each call.
float[] floatArray;

/// <summary>
/// Recognizes one recorded utterance: converts the raw 16-bit PCM bytes to
/// normalized floats, denoises them, decodes them with the offline recognizer,
/// adds punctuation, sends the text to the client and, when an LLM is
/// configured, forwards the text to it.
/// </summary>
/// <param name="bytes">
/// Raw audio bytes; every two bytes are read as one 16-bit sample. A trailing
/// odd byte is ignored, and a null or sub-sample payload is skipped entirely.
/// </param>
private void Recognize(byte[] bytes)
{
    // Nothing to recognize when there is not even one whole sample.
    if (bytes == null || bytes.Length < 2)
    {
        return;
    }

    // Copy whole samples only: copying bytes.Length bytes into a short[] of
    // bytes.Length / 2 elements overruns the destination (and throws) when
    // the payload length is odd.
    int sampleCount = bytes.Length / 2;
    int16Array = new short[sampleCount];
    Buffer.BlockCopy(bytes, 0, int16Array, 0, sampleCount * sizeof(short));

    floatArray = new float[sampleCount];
    for (int i = 0; i < sampleCount; i++)
    {
        floatArray[i] = int16Array[i] / 32768.0f;
    }

    // Noise suppression, applied in place on floatArray.
    // NOTE(review): RNNoise is generally described as operating on 48 kHz mono
    // audio - confirm that running it on audio at sampleRate is intended.
    using (var denoiser = new Denoiser())
    {
        int count = denoiser.Denoise(floatArray.AsSpan());
        Console.WriteLine("denoised count:" + count);
    }

    string result;
    offlineStream = recognizer.CreateStream();
    try
    {
        offlineStream.AcceptWaveform(sampleRate, floatArray);
        recognizer.Decode(offlineStream);
        result = offlineStream.Result.Text;
    }
    finally
    {
        // Release the stream even when feeding or decoding throws.
        offlineStream.Dispose();
    }

    Console.WriteLine("result:" + result);
    if (string.IsNullOrWhiteSpace(result))
    {
        return;
    }

    result = offlinePunctuation.AddPunct(result.ToLower());
    BaseMsg textMsg = new BaseMsg(1, result);
    client.Send(JsonConvert.SerializeObject(textMsg));
    if (llm != null)
    {
        llm.RequestAsync(result);
    }
}
4、其他问题,声音卡顿,爆音,音频长了后半段直接爆炸刺啦……
放上最新截图
按键 G47接GND,按钮还没到货……
工程地址,持续修改中
https://github.com/xue-fei/homeai