语音指令分类模型训练(基于机器学习方法)

张开发
2026/4/14 11:08:29 15 分钟阅读

分享文章

语音指令分类模型训练(基于机器学习方法)
# ============================================================
# Voice-command classification with classical ML.
# Reconstructed from an article listing in which every `=`, `+`,
# comparison operator and string quote had been stripped by the
# page extractor; operators below are restored by inference
# (reconstructed spots are marked NOTE(review)).
#
# The article contains FOUR separate script files, marked by the
# "file:" banners — keep them as separate modules on disk
# (sections 3 and 4 do `from read_data import ...`).
# ============================================================

# ------------------------------------------------------------
# 1. file: stat_lengths.py — survey audio lengths so that the
#    padding/truncation parameters can be chosen sensibly.
# ------------------------------------------------------------
import os

import librosa
import numpy as np

DATA_PATH = "data4c"        # dataset root: one sub-folder per class label
FIXED_SAMPLE_RATE = 16000   # resample everything to 16 kHz (speech standard)


def stat_audio_lengths():
    """Scan every audio file under DATA_PATH and print per-file and
    aggregate length statistics (sample counts and durations in seconds)."""
    all_lengths = []     # length of each clip, in samples
    all_durations = []   # length of each clip, in seconds

    for label in os.listdir(DATA_PATH):
        label_path = os.path.join(DATA_PATH, label)
        if not os.path.isdir(label_path):
            continue  # skip stray files at the dataset root
        for fname in os.listdir(label_path):
            fpath = os.path.join(label_path, fname)
            try:
                y, sr = librosa.load(fpath, sr=FIXED_SAMPLE_RATE)
                length = len(y)          # samples
                duration = length / sr   # seconds
                all_lengths.append(length)
                all_durations.append(duration)
                print(f"{fpath} - 采样点数: {length}, 时长: {duration:.2f}s")
            except Exception as e:
                # best-effort survey: report unreadable files and keep going
                print(f"读取失败: {fpath} - {e}")

    if all_lengths:
        print(f"总文件数: {len(all_lengths)}")
        print(f"采样点数 - 最大值: {np.max(all_lengths)}, "
              f"最小值: {np.min(all_lengths)}, 平均值: {np.mean(all_lengths):.0f}")
        print(f"时长 - 最大值: {np.max(all_durations):.2f}s, "
              f"最小值: {np.min(all_durations):.2f}s, 平均值: {np.mean(all_durations):.2f}s")
    else:
        print("没有读取任何音频文件")


if __name__ == "__main__":
    stat_audio_lengths()


# ------------------------------------------------------------
# 2. file: read_data.py — load the dataset, extract MFCC
#    features, align feature lengths, and split train/test.
# ------------------------------------------------------------
import os

import librosa
import numpy as np
from sklearn.model_selection import train_test_split

FIXED_SAMPLE_RATE = 16000  # unified sample rate, 16 kHz
MAX_LEN = 36000            # target number of MFCC frames after pad/truncate.
                           # NOTE(review): the article's comment said "采样点"
                           # (sample points), but the value is applied to the
                           # FRAME axis below — confirm 36000 frames is intended.
N_MFCC = 13                # MFCC coefficient count (13 is conventional)


def load_data4ml(data_pt):
    """Build a 2-D feature matrix for classical ML classifiers.

    Each audio file becomes one flattened (39 x MAX_LEN) feature vector of
    MFCC + delta + delta-delta coefficients; the containing folder name is
    used as its label. Also writes the label list to result/label.txt.

    Returns (X_train, X_test, y_train, y_test) from train_test_split.
    """
    X, y = [], []
    labels = os.listdir(data_pt)
    # fix: the original opened this file and never closed it
    with open("result/label.txt", "w") as f:
        for label in labels:
            folder = os.path.join(data_pt, label)
            # NOTE(review): operators lost in extraction; tokens were
            # `str(label) label \n` — reconstructed as concatenation.
            f.write(str(label) + label + "\n")
            for fname in os.listdir(folder):
                fpath = os.path.join(folder, fname)
                # y_audio: time-domain signal, sr: sample rate
                y_audio, sr = librosa.load(fpath, sr=FIXED_SAMPLE_RATE)
                mfccs = librosa.feature.mfcc(
                    y=y_audio,
                    sr=sr,
                    n_mfcc=N_MFCC,    # 13 MFCC coefficients
                    n_fft=512,        # FFT window length
                    hop_length=256,   # frame hop
                    n_mels=40,        # mel filterbank size
                )
                # unify frame count: truncate the tail or zero-pad at the end
                n_mfcc, current_frames = mfccs.shape
                if current_frames > MAX_LEN:
                    mfccs_fixed = mfccs[:, :MAX_LEN]
                else:
                    pad_length = MAX_LEN - current_frames
                    mfccs_fixed = np.pad(mfccs, ((0, 0), (0, pad_length)),
                                         mode="constant")
                # per-coefficient normalisation over the time axis
                mfcc = (mfccs_fixed - np.mean(mfccs_fixed, axis=1, keepdims=True)) / (
                    np.std(mfccs_fixed, axis=1, keepdims=True) + 1e-8)
                # first/second-order deltas, stacked into 39 feature rows
                delta_mfcc = librosa.feature.delta(mfcc)
                delta2_mfcc = librosa.feature.delta(mfcc, order=2)
                mfcc_39d = np.concatenate([mfcc, delta_mfcc, delta2_mfcc], axis=0)
                feature = mfcc_39d.flatten()  # 2-D feature -> 1-D vector
                X.append(feature)
                y.append(label)

    X = np.array(X)
    y = np.array(y)
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    print("训练样本及标签:", X_train.shape, y_train.shape)
    print("测试样本及标签:", X_test.shape, y_test.shape)
    return X_train, X_test, y_train, y_test


# ------------------------------------------------------------
# 3. file: train.py — train a classifier on the MFCC features
#    and persist it when accuracy is good enough.
# ------------------------------------------------------------
import joblib
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

from read_data import load_data4ml


def train(data_path):
    """Train an MLP on the dataset under ``data_path``; save the model
    to result/mlp_model.pkl when test accuracy exceeds 85%."""
    X_train, X_test, y_train, y_test = load_data4ml(data_path)
    print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
    # alternatives tried in the article, kept for reference:
    # model = KNeighborsClassifier(n_neighbors=5)  # KNN, suits command recognition
    # model = SVC()                                # support vector machine
    model = MLPClassifier(hidden_layer_sizes=(200, 100), activation="relu")
    model.fit(X_train, y_train)
    # held-out accuracy
    acc = model.score(X_test, y_test)
    print(f"训练完成\n测试准确率{acc * 100:.2f}%")
    if acc > 0.85:  # only keep models that generalise reasonably well
        joblib.dump(model, "result/mlp_model.pkl")
        print("模型已保存result/mlp_model.pkl")


if __name__ == "__main__":
    train(data_path="data4c")


# ------------------------------------------------------------
# 4. file: predict.py — run the saved model on a single file.
# ------------------------------------------------------------
import joblib
import librosa
import numpy as np

from read_data import FIXED_SAMPLE_RATE, MAX_LEN, N_MFCC


def process_data(fpt):
    """Extract the same flattened 39-row MFCC feature used at training
    time from one audio file; returns an array of shape (1, d)."""
    # y_audio: time-domain signal, sr: sample rate
    y_audio, sr = librosa.load(fpt, sr=FIXED_SAMPLE_RATE)
    mfccs = librosa.feature.mfcc(
        y=y_audio,
        sr=sr,
        n_mfcc=N_MFCC,    # 13 MFCC coefficients
        n_fft=512,        # FFT window length
        hop_length=256,   # frame hop
        n_mels=40,        # mel filterbank size
    )
    # unify frame count: truncate the tail or zero-pad at the end
    n_mfcc, current_frames = mfccs.shape
    if current_frames > MAX_LEN:
        mfccs_fixed = mfccs[:, :MAX_LEN]
    else:
        pad_length = MAX_LEN - current_frames
        mfccs_fixed = np.pad(mfccs, ((0, 0), (0, pad_length)), mode="constant")
    # per-coefficient normalisation over the time axis
    mfcc = (mfccs_fixed - np.mean(mfccs_fixed, axis=1, keepdims=True)) / (
        np.std(mfccs_fixed, axis=1, keepdims=True) + 1e-8)
    # first/second-order deltas, stacked into 39 feature rows
    delta_mfcc = librosa.feature.delta(mfcc)
    delta2_mfcc = librosa.feature.delta(mfcc, order=2)
    mfcc_39d = np.concatenate([mfcc, delta_mfcc, delta2_mfcc], axis=0)
    feature = mfcc_39d.flatten()  # 2-D feature -> 1-D vector
    return np.array([feature])


def predict(file):
    """Classify one wav file whose ground-truth label is encoded in its
    filename as ``<label>_<anything>.wav``; returns the predicted label."""
    model = joblib.load("result/mlp_model.pkl")
    X = process_data(file)
    lb = file.split("/")[-1].split("_")[0]  # true label parsed from filename
    pred = model.predict(X)
    print(pred)
    flg = "正确" if pred[0] == lb else "错误"
    print(f"识别{flg}\t真实标签:{lb}\t识别结果{pred[0]}\t")
    return pred[0]


if __name__ == "__main__":
    # sample test clip: label "3"
    file = "test_data/3_1774506934932.wav"
    predict(file)

更多文章