Contents
1. Overview
2. Environment Setup and Installation
3. Speech Recognition
4. Speech Synthesis
5. Calling GPT
6. Message Flow
7. Results
1. Overview
Goal: in a ROS system, let the user ask a question by voice and receive an intelligent spoken answer.
Technologies used: iFlytek (科大讯飞) speech recognition and speech synthesis, plus GPT
Operating system: Ubuntu 20.04
2. Environment Setup and Installation
(1) Install ROS
Install it with the FishROS (鱼香ROS) one-click script:
wget http://fishros.com/install -O fishros && . fishros
(2) Install dependencies
Calling the ChatGPT API from Python relies on the openai package; install it with:
pip install openai
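Before wiring the API into ROS, it can help to confirm the key and package work on their own. The snippet below is a minimal sketch using the same chat.completions API as the node in section 5; the key string is a placeholder, substitute your own.
# Minimal sketch: confirm the openai package and your API key work outside ROS.
# "your api key here!!" is a placeholder; replace it with your own key.
from openai import OpenAI

client = OpenAI(api_key="your api key here!!")
reply = client.chat.completions.create(
    messages=[{"role": "user", "content": "hello"}],
    model="gpt-3.5-turbo",
)
print(reply.choices[0].message.content)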
3. Speech Recognition
The node below (iat_publish) uses the iFlytek SDK to capture speech from the microphone and publishes the recognized text on the iat_text topic.
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <termio.h>
#include "qisr.h"
#include "msp_cmn.h"
#include "msp_errors.h"
#include "speech_recognizer.h"
#include <iconv.h>
#include "ros/ros.h"
#include "std_msgs/String.h"
#define FRAME_LEN 640
#define BUFFER_SIZE 4096
int wakeupFlag = 0 ;
int resultFlag = 0 ;
static void show_result(char *string, char is_over)
{
resultFlag=1;
printf("\rResult: [ %s ]", string);
if(is_over)
putchar('\n');
}
static char *g_result = NULL;
static unsigned int g_buffersize = BUFFER_SIZE;
void on_result(const char *result, char is_last)
{
if (result) {
size_t left = g_buffersize - 1 - strlen(g_result);
size_t size = strlen(result);
if (left < size) {
g_result = (char*)realloc(g_result, g_buffersize + BUFFER_SIZE);
if (g_result)
g_buffersize += BUFFER_SIZE;
else {
printf("mem alloc failed\n");
return;
}
}
strncat(g_result, result, size);
show_result(g_result, is_last);
}
}
void on_speech_begin()
{
if (g_result)
{
free(g_result);
}
g_result = (char*)malloc(BUFFER_SIZE);
g_buffersize = BUFFER_SIZE;
memset(g_result, 0, g_buffersize);
printf("Start Listening...\n");
printf("Press \"Space\" key Stop\n");
}
void on_speech_end(int reason)
{
if (reason == END_REASON_VAD_DETECT)
printf("\nSpeaking done \n");
else
printf("\nRecognizer error %d\n", reason);
}
/* demo: recognize audio from the microphone */
static void demo_mic(const char* session_begin_params)
{
int errcode;
int i = 0;
struct speech_rec iat;
struct speech_rec_notifier recnotifier = {
on_result,
on_speech_begin,
on_speech_end
};
errcode = sr_init(&iat, session_begin_params, SR_MIC, &recnotifier);
if (errcode) {
printf("speech recognizer init failed\n");
return;
}
errcode = sr_start_listening(&iat);
if (errcode) {
printf("start listen failed %d\n", errcode);
}
/* demo 10 seconds recording */
// while(i++ < 10)
// sleep(1);
int ch;
while(1){
ch = getchar();
if(ch == 32){
printf("\nSpeaking done \n");
break;
}
}
errcode = sr_stop_listening(&iat);
if (errcode) {
printf("stop listening failed %d\n", errcode);
}
sr_uninit(&iat);
}
int main(int argc, char* argv[])
{
ros::init(argc, argv, "iFlyAutoTransform");
ros::NodeHandle n;
ros::Rate loop_rate(10);
ros::Publisher iat_text_pub = n.advertise<std_msgs::String>("iat_text", 1000);
termios tms_old, tms_new;
tcgetattr(0, &tms_old);
tms_new = tms_old;
tms_new.c_lflag &= ~(ICANON | ECHO);
tcsetattr(0, TCSANOW, &tms_new);
ROS_INFO("Press \"Space\" key to Start,Press \"Enter\" key to Exit.");
int count=0;
int ch;
while(ros::ok())
{
ch = getchar();
printf("Pressed Key Value %d\n",ch);
if(ch == 32){ //Space key
wakeupFlag = 1;
}
if(ch == 10){ //Enter key
ROS_INFO("Node Exit.");
break;
}
if (wakeupFlag){
int ret = MSP_SUCCESS;
/* login params, please do keep the appid correct */
const char* login_params = "appid = YOUR_APPID, work_dir = ."; // the appid must match the one bound to your iFlytek SDK package
const char* session_begin_params =
"sub = iat, domain = iat, language = zh_cn, "
"accent = mandarin, sample_rate = 16000, "
"result_type = plain, result_encoding = utf8";
ret = MSPLogin(NULL, NULL, login_params);
if(MSP_SUCCESS != ret){
MSPLogout();
printf("MSPLogin failed , Error code %d.\n",ret);
}
printf("Demo recognizing the speech from microphone\n");
// printf("Speak in 10 seconds\n");
demo_mic(session_begin_params);
// printf("10 sec passed\n");
wakeupFlag=0;
MSPLogout();
}
// speech recognition finished; publish the result
if(resultFlag){
resultFlag=0;
std_msgs::String msg;
msg.data = g_result;
iat_text_pub.publish(msg);
}
ROS_INFO("Press \"Space\" key to Start,Press \"Enter\" key to Exit.");
ros::spinOnce();
loop_rate.sleep();
count++;
}
exit:
tcsetattr(0, TCSANOW, &tms_old);
MSPLogout(); // Logout...
return 0;
}
4. Speech Synthesis
The node below (tts_subscribe) subscribes to gpt_reply_to_user, synthesizes the reply into a WAV file with the iFlytek TTS SDK, and plays it with mplayer.
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <unistd.h>
#include <errno.h>
#include "qtts.h"
#include "msp_cmn.h"
#include "msp_errors.h"
#include "ros/ros.h"
#include "std_msgs/String.h"
#include <sstream>
#include <sys/types.h>
#include <sys/stat.h>
/* WAV audio header format */
typedef struct _wave_pcm_hdr
{
char riff[4]; // = "RIFF"
int size_8; // = FileSize - 8
char wave[4]; // = "WAVE"
char fmt[4]; // = "fmt "
int fmt_size; // = size of the following fmt data: 16
short int format_tag; // = PCM: 1
short int channels; // = number of channels: 1
int samples_per_sec; // = sample rate: 8000 | 6000 | 11025 | 16000
int avg_bytes_per_sec; // = bytes per second: samples_per_sec * bits_per_sample / 8
short int block_align; // = bytes per sample frame: bits_per_sample / 8
short int bits_per_sample; // = bits per sample: 8 | 16
char data[4]; // = "data";
int data_size; // = raw audio data length: FileSize - 44
} wave_pcm_hdr;
/* default WAV header */
wave_pcm_hdr default_wav_hdr =
{
{ 'R', 'I', 'F', 'F' },
0,
{'W', 'A', 'V', 'E'},
{'f', 'm', 't', ' '},
16,
1,
1,
16000,
32000,
2,
16,
{'d', 'a', 't', 'a'},
0
};
/* synthesize text to a WAV file */
int text_to_speech(const char* src_text, const char* des_path, const char* params)
{
int ret = -1;
FILE* fp = NULL;
const char* sessionID = NULL;
unsigned int audio_len = 0;
wave_pcm_hdr wav_hdr = default_wav_hdr;
int synth_status = MSP_TTS_FLAG_STILL_HAVE_DATA;
if (NULL == src_text || NULL == des_path)
{
printf("params is error!\n");
return ret;
}
fp = fopen(des_path, "wb");
if (NULL == fp)
{
printf("open %s error.\n", des_path);
return ret;
}
/* begin synthesis */
sessionID = QTTSSessionBegin(params, &ret);
if (MSP_SUCCESS != ret)
{
printf("QTTSSessionBegin failed, error code: %d.\n", ret);
fclose(fp);
return ret;
}
ret = QTTSTextPut(sessionID, src_text, (unsigned int)strlen(src_text), NULL);
if (MSP_SUCCESS != ret)
{
printf("QTTSTextPut failed, error code: %d.\n",ret);
QTTSSessionEnd(sessionID, "TextPutError");
fclose(fp);
return ret;
}
printf("正在合成 ...\n");
fwrite(&wav_hdr, sizeof(wav_hdr) ,1, fp); //添加wav音频头,使用采样率为16000
while (1)
{
/* fetch a chunk of synthesized audio */
const void* data = QTTSAudioGet(sessionID, &audio_len, &synth_status, &ret);
if (MSP_SUCCESS != ret)
break;
if (NULL != data)
{
fwrite(data, audio_len, 1, fp);
wav_hdr.data_size += audio_len; // accumulate data_size
}
if (MSP_TTS_FLAG_DATA_END == synth_status)
break;
printf(">");
usleep(150*1000); // avoid hogging the CPU while polling
} // see the iFlytek Voice Cloud API documentation for the possible synth_status values
printf("\n");
if (MSP_SUCCESS != ret)
{
printf("QTTSAudioGet failed, error code: %d.\n",ret);
QTTSSessionEnd(sessionID, "AudioGetError");
fclose(fp);
return ret;
}
/* fix up the size fields in the WAV header */
wav_hdr.size_8 += wav_hdr.data_size + (sizeof(wav_hdr) - 8);
/* write the corrected fields back to the header; the output file is in WAV format */
fseek(fp, 4, SEEK_SET);
fwrite(&wav_hdr.size_8, sizeof(wav_hdr.size_8), 1, fp); // write size_8
fseek(fp, 40, SEEK_SET); // seek to where data_size is stored
fwrite(&wav_hdr.data_size, sizeof(wav_hdr.data_size), 1, fp); // write data_size
fclose(fp);
fp = NULL;
/* synthesis finished */
ret = QTTSSessionEnd(sessionID, "Normal");
if (MSP_SUCCESS != ret)
{
printf("QTTSSessionEnd failed, error code: %d.\n",ret);
}
return ret;
}
void ttsCallback(const std_msgs::String::ConstPtr& msg)
{
char cmd[2000];
const char* text;
int ret = MSP_SUCCESS;
const char* session_begin_params = "voice_name = x4_twcn_ziwen_assist, text_encoding = utf8, sample_rate = 16000, speed = 50, volume = 50, pitch = 50, rdn = 2";
const char* filename = "tts_sample.wav"; // name of the synthesized audio file
std::cout<<"I heard :"<<msg->data.c_str()<<std::endl;
text = msg->data.c_str();
/* synthesize the text */
printf("Start synthesizing ...\n");
ret = text_to_speech(text, filename, session_begin_params);
if (MSP_SUCCESS != ret)
{
printf("text_to_speech failed, error code: %d.\n", ret);
}
printf("合成完毕\n");
unlink("/tmp/cmd");
mkfifo("/tmp/cmd", 0777);
popen("mplayer -quiet -slave -input file=/tmp/cmd 'tts_sample.wav'","r");
sleep(3);
}
void toExit()
{
printf("按任意键退出 ...\n");
getchar();
MSPLogout(); //退出登录
}
int main(int argc, char* argv[])
{
int ret = MSP_SUCCESS;
const char* login_params = "appid = YOUR_APPID, work_dir = ."; // the appid must match the one bound to your iFlytek SDK package
/*
* rdn: how digits are read out in the synthesized audio
* volume: volume of the synthesized audio
* pitch: pitch of the synthesized audio
* speed: speaking rate of the synthesized audio
* voice_name: speaker (voice) used for synthesis
* sample_rate: sample rate of the synthesized audio
* text_encoding: encoding of the input text
*
* See the iFlytek Voice Cloud MSC API documentation for details on these parameters.
*/
/* log in */
ret = MSPLogin(NULL, NULL, login_params); // args: username, password, login params; username and password can be obtained by registering at http://open.voicecloud.cn
if (MSP_SUCCESS != ret)
{
printf("MSPLogin failed, error code: %d.\n", ret);
/* goto exit; */ // login failed: log out and exit
toExit();
}
ros::init(argc,argv,"TextToSpeech");
ros::NodeHandle n;
ros::Subscriber tts_text_sub = n.subscribe("gpt_reply_to_user", 1000, ttsCallback);
ros::spin();
exit:
MSPLogout(); // log out
return 0;
}
5. Calling GPT
The Python node below (chatgpt_communication.py) subscribes to iat_text, sends the user's question to the ChatGPT API, and publishes the answer on gpt_reply_to_user.
#!/usr/bin/env python3
import rospy
from std_msgs.msg import String
from openai import OpenAI

# Initialize the OpenAI client
client = OpenAI(
    api_key="your api key here!!"
)

def user_message_callback(data):
    rospy.loginfo("Received from user: %s", data.data)
    # Send the request to GPT and get the reply
    chat_completion = client.chat.completions.create(
        messages=[
            {
                "role": "user",
                "content": data.data
            },
        ],
        model="gpt-3.5-turbo"
    )
    # Extract GPT's reply
    gpt_reply = chat_completion.choices[0].message.content
    rospy.loginfo("GPT Reply: %s", gpt_reply)
    # Publish GPT's reply
    gpt_reply_pub.publish(gpt_reply)

if __name__ == '__main__':
    try:
        rospy.init_node('chatgpt_ros_node', anonymous=True)
        # Subscribe to the recognized user speech
        rospy.Subscriber("iat_text", String, user_message_callback)
        # Publisher for GPT's reply to the user
        gpt_reply_pub = rospy.Publisher("gpt_reply_to_user", String, queue_size=10)
        rospy.spin()
    except rospy.ROSInterruptException:
        pass
6. Message Flow
The three nodes are connected through two topics: iat_publish publishes the recognized speech as text on iat_text; chatgpt_communication.py subscribes to iat_text, queries GPT, and publishes the answer on gpt_reply_to_user; tts_subscribe subscribes to gpt_reply_to_user, synthesizes the reply, and plays it.
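To exercise the GPT and TTS nodes without running the microphone node, a minimal test publisher such as the sketch below can be used (the script name test_iat_publisher.py and the test sentence are assumptions, not part of the original package):
#!/usr/bin/env python3
# Minimal sketch: publish one test question on iat_text so the GPT and TTS
# nodes can be tested without the microphone node.
import rospy
from std_msgs.msg import String

if __name__ == '__main__':
    rospy.init_node('test_iat_publisher', anonymous=True)
    pub = rospy.Publisher('iat_text', String, queue_size=10)
    rospy.sleep(1.0)  # give the publisher time to connect to subscribers
    pub.publish("你好,请简单介绍一下你自己")  # any test question works here
    rospy.loginfo("Published a test question on iat_text")
    rospy.sleep(1.0)  # keep the node alive briefly so the message is delivered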
7. Results
Run steps:
Terminal 1:
roscore
Terminal 2:
rosrun robot_voice chatgpt_communication.py
Terminal 3:
rosrun robot_voice tts_subscribe
Terminal 4:
rosrun robot_voice iat_publish
Then follow the prompts in the fourth terminal and ask your question by voice.
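If some part of the chain stays silent, a quick sanity check is to confirm that both topics are registered while the nodes are running. The following is a small sketch, not part of the original package; the node name check_voice_pipeline is an assumption:
#!/usr/bin/env python3
# Sketch: verify that the two topics linking the nodes are registered.
# Assumes roscore and the three nodes above are already running.
import rospy

if __name__ == '__main__':
    rospy.init_node('check_voice_pipeline', anonymous=True)
    published = [name for name, _ in rospy.get_published_topics()]
    for topic in ('/iat_text', '/gpt_reply_to_user'):
        state = "OK" if topic in published else "MISSING"
        rospy.loginfo("%s: %s", topic, state)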