Stepfun Realtime API 阶跃星辰实时语音

1. 简介

本项目是阶跃星辰 Realtime API SDK，让您能够轻松地将语音功能集成到应用程序中，实现流畅的语音对话体验。

当前版本仅支持 Node.js，不支持浏览器使用。

2. 快速开始

2.1 文本对话示例

这是最简单的使用方式——无需任何输入，AI 会自动生成语音回复，示例程序会打印该语音回复对应的文字内容。

2.1.1 安装依赖

npm init -y
npm install stepfun-realtime-api

2.1.2 示例代码

创建 main.js 文件，内容如下：

const { RealtimeClient, ServerEventType } = require("stepfun-realtime-api");

/**
 * Minimal example: connect, set the session instructions, request a
 * response, print the transcript of the audio part, then disconnect.
 */
async function simpleChat() {
  const options = {
    url: "wss://api.stepfun.com/v1/realtime",
    secret: process.env.STEP_SECRET, // taken from the STEP_SECRET environment variable
  };
  const client = new RealtimeClient(options);

  // Open the connection before configuring the session.
  await client.connect();
  console.log("✅ 已连接到 Stepfun Realtime API");

  const instructions = "你是一个友好的聊天助手，回答用户的问题。";
  client.updateSession({ instructions });

  // Print the transcript once an audio content part is done, then hang up.
  const onContentPartDone = ({ part }) => {
    if (part.type !== "audio") {
      return;
    }
    console.log("🤖 AI回复:", part.transcript);
    client.disconnect();
  };
  client.on(ServerEventType.ResponseContentPartDone, onContentPartDone);

  // Ask the server to produce a response.
  await client.createResponse();
}

// Run the example and print any error.
simpleChat().catch(console.error);

2.1.3 运行程序

设置环境变量并运行：

export STEP_SECRET=your_api_key_here
node main.js

2.2 语音对话示例

本示例实现真正的语音交互：

语音检测模式： 服务端 VAD（Voice Activity Detection）
输入方式： 本地麦克风采集
输出方式： 扬声器播放 AI 回复

2.2.1 平台特定要求

音频捕获功能需要根据操作系统安装相应的依赖：

Linux 系统（包括 Raspbian）

sudo apt-get update
sudo apt-get install alsa-utils

macOS 系统

brew install sox

Windows 系统

方式 1：从官方网站下载安装
方式 2：使用 Chocolatey 安装

choco install sox

2.2.2 验证安装

测试录音功能以确保设备正常工作：

Linux 系统：

arecord --duration=5 test.wav

macOS/Windows 系统：

sox -d -t wav test.wav trim 0 5

⚠️ 注意： 运行示例前，请确保麦克风已正确连接并被系统识别。

2.2.3 代码示例

const Speaker = require("speaker");
const Mic = require("mic-ts").default;
const { RealtimeClient, ServerEventType } = require("stepfun-realtime-api");

/**
 * Voice chat example: streams microphone audio to the realtime client and
 * writes the audio deltas it receives to the speaker. When the response
 * audio is done, the microphone, speaker and client are all shut down.
 */
async function voiceChat() {
  const client = new RealtimeClient({
    url: "wss://api.stepfun.com/v1/realtime",
    secret: process.env.STEP_SECRET,
  });

  // Session settings: server-side voice activity detection plus instructions.
  const sessionConfig = {
    turn_detection: { type: "server_vad" },
    instructions: "你是一个友好的AI助手，请用简洁的语言回答问题。",
  };
  client.updateSession(sessionConfig);

  await client.connect();
  console.log("🎤 语音对话已开始，开始说话吧...");

  // Microphone capture.
  const mic = Mic({ fileType: "wavpcm", rate: "24000", channels: "1" });
  const micStream = mic.getAudioStream();

  // Captured audio is accumulated and forwarded once this many bytes are pending.
  const flushThreshold = 8192 * 2;
  let pending = Buffer.alloc(0);
  mic.start();

  micStream.on("data", (chunk) => {
    pending = Buffer.concat([pending, chunk]);
    if (pending.length < flushThreshold) {
      return;
    }
    client.appendInputAudio(pending);
    pending = Buffer.alloc(0);
  });

  // Speaker playback of the AI reply.
  const speaker = new Speaker({ channels: 1, bitDepth: 16, sampleRate: 24000 });

  // Each audio delta arrives base64-encoded; decode it and play it if the speaker is still open.
  const onAudioDelta = ({ delta }) => {
    const pcm = Buffer.from(delta, "base64");
    if (!speaker.writable || speaker.closed) {
      return;
    }
    speaker.write(pcm);
  };
  client.on(ServerEventType.ResponseAudioDelta, onAudioDelta);

  // Once the reply audio is done, stop capturing, finish playback and disconnect.
  const onAudioDone = () => {
    console.log("🤖 AI回复已完成");
    mic.stop();
    speaker.end();
    client.disconnect();
  };
  client.on(ServerEventType.ResponseAudioDone, onAudioDone);

  // Also disconnect when the response as a whole is done.
  const onResponseDone = () => {
    client.disconnect();
  };
  client.on(ServerEventType.ResponseDone, onResponseDone);

  console.log("按 Ctrl+C 退出");
}

// Run the example; on failure, report the error and exit with a non-zero code.
voiceChat().catch((err) => {
  console.error("发生错误:", err);
  process.exit(1);
});

3. 功能特性

3.1 工具调用（Tool Call）支持

从 v0.0.16 开始，SDK 完全支持工具调用功能，让 AI 能够在对话过程中调用外部函数和服务。

3.1.1 添加工具

const client = new RealtimeClient({
  url: "wss://api.stepfun.com/v1/realtime",
  secret: process.env.STEP_SECRET,
});

// Definition of the "get_weather" function tool: one required string parameter, `city`.
const weatherTool = {
  type: "function",
  function: {
    name: "get_weather",
    description: "获取指定城市的天气信息",
    parameters: {
      type: "object",
      properties: {
        city: { type: "string", description: "城市名称" },
      },
      required: ["city"],
    },
  },
};

// Register the tool on the client before connecting.
client.addTool(weatherTool);

// A registered tool can be removed by name: client.removeTool("get_weather")

await client.connect();

3.1.2 处理工具调用

// Listen for tool-call requests and answer calls to the "get_weather" tool.
client.on(ServerEventType.ResponseFunctionCallArgumentsDone, (event) => {
  // NOTE(review): the follow-up example later in this document reads `name` and
  // `call_id` directly from the event instead of `event.item` — confirm which
  // shape the SDK actually emits.
  const { name, call_id, arguments: args } = event.item;

  if (name !== "get_weather") {
    return;
  }

  try {
    // `args` is a JSON string; parsing can throw if it is malformed.
    const params = JSON.parse(args);
    const weatherResult = getWeatherInfo(params.city); // your implementation

    // Return the result to the AI.
    client.sendToolResult(call_id, JSON.stringify(weatherResult));
  } catch (error) {
    // Do not let a bad argument string or a failing tool escape the event
    // handler: log it and report the failure to the AI instead.
    console.error("工具调用失败:", error);
    client.sendToolResult(call_id, JSON.stringify({ error: "Invalid call" }));
  }
});

// Remove the tool.
client.removeTool("get_weather");

调用 client.sendToolResult 之后，AI 并不会自动继续说话，您需要手动调用 client.createResponse() 方法来创建回复。创建回复的时机很重要：可以在本地监听音频播放完成事件（LocalEventType.ConversationAudioPlaybackCompleted），并在该事件触发时主动调用。

例如，下面的示例代码会在当前轮次的声音播放结束后，主动让 AI 继续说话（声音播放结束的时间是根据 AI 返回的音频长度推算出来的，并非外放设备真正的播放结束时间）：

// Set to true after a successful tool result has been sent; read and then
// reset by the playback-completed handler below.
let function_call_output_created = false;

// Answer calls to the "get_time" tool.
client.on(ServerEventType.ResponseFunctionCallArgumentsDone, (ev) => {
  if (ev.name !== "get_time") {
    return;
  }

  try {
    const time = get_time();
    client.sendToolResult(ev.call_id, JSON.stringify({ time }));
    function_call_output_created = true;
  } catch (error) {
    // Log the failure rather than swallowing it, then report it to the AI.
    console.error("工具调用失败:", error);
    client.sendToolResult(ev.call_id, JSON.stringify({ error: "Invalid call" }));
  }
});

// When local audio playback completes without being interrupted and a tool
// result was sent during this turn, ask the AI to continue with a new response.
client.on(LocalEventType.ConversationAudioPlaybackCompleted, (ev) => {
  if (!ev.is_interrupted && function_call_output_created) {
    client.createResponse();
  }
  // Always reset the flag, whether or not a response was created.
  function_call_output_created = false;
});

4. 许可证

MIT