def text_to_speech(text, output_file="output.mp3", timeout=60):
    """Synthesize ``text`` to speech and save the audio to ``output_file``.

    Posts a non-streaming request to the ``t2a_v2`` endpoint at
    api.minimaxi.com, hex-decodes the ``data.audio`` field of the JSON reply
    and writes the resulting bytes to disk. Uses the module-level ``group_id``
    and ``api_key`` for the query string and the bearer token.

    Args:
        text: The text to synthesize.
        output_file: Path of the audio file to write (opened in binary mode).
        timeout: Seconds to wait for the HTTP request before giving up.

    Raises:
        requests.RequestException: On a network failure, a timeout, or a
            non-success HTTP status.
        ValueError: If the response body is not valid JSON, or the audio
            field is not a valid hex string.
        RuntimeError: If the response carries no audio data.
    """
    url = f"https://api.minimaxi.com/v1/t2a_v2?GroupId={group_id}"
    payload = json.dumps({
        "model": "speech-2.5-hd-preview",
        "text": text,
        "stream": False,
        "voice_setting": {
            "voice_id": "male-qn-qingse",
            "speed": 1,
            "vol": 1,
            "pitch": 0,
            "emotion": "happy",
        },
        "audio_setting": {
            "sample_rate": 32000,
            "bitrate": 128000,
            "format": "mp3",
            "channel": 1,
        },
    })
    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json",
    }

    # Without an explicit timeout, requests waits indefinitely on a stalled
    # connection.
    response = requests.post(url, headers=headers, data=payload, timeout=timeout)
    # Fail fast on HTTP errors instead of trying to parse an error page.
    response.raise_for_status()
    parsed_json = response.json()

    # Look the audio up defensively so a failed synthesis produces a readable
    # error rather than a bare KeyError/TypeError.
    data = parsed_json.get("data") if isinstance(parsed_json, dict) else None
    audio_hex = data.get("audio") if isinstance(data, dict) else None
    if not audio_hex:
        # NOTE(review): "base_resp" presumably holds the API's own status
        # code/message -- confirm against the service docs. It is included
        # verbatim purely as a diagnostic.
        base_resp = (
            parsed_json.get("base_resp") if isinstance(parsed_json, dict) else None
        )
        raise RuntimeError(
            f"TTS response contains no audio data (base_resp={base_resp!r})"
        )

    # Decode before opening the file so a bad payload does not leave an empty
    # or truncated output file behind.
    audio_value = bytes.fromhex(audio_hex)
    with open(output_file, "wb") as f:
        f.write(audio_value)
    print(f"语音已保存为 {output_file}")
# ==== Demo run ====
if __name__ == "__main__":
    # Input image to recognize, and the audio file the narration is saved to.
    demo_image = "test.jpg"
    demo_audio = "output.mp3"

    # Step 1: get the explanation text for the image.
    explain_text = recognize_image(demo_image)
    print("识别讲解结果:\n", explain_text)

    # Step 2: turn that text into speech.
    text_to_speech(explain_text, demo_audio)