[Video] Testing Google Gemini Audio Capabilities

Gemini with audio

Video

Code

Mix.install([
  {:req, "~> 0.4.14"},
  {:kino, "~> 0.12.0"}
])

Form

# Input form: a free-text prompt plus an audio clip requested in WAV format.
form =
  Kino.Control.form(
    [
      prompt: Kino.Input.textarea("Prompt"),
      audio: Kino.Input.audio("Audio", format: :wav)
    ],
    submit: "Submit"
  )

# Output area; its content is replaced on every submission.
frame = Kino.Frame.new()

Kino.listen(form, fn
  # The audio field is nil until a clip has been recorded or uploaded. Show a
  # hint instead of letting the handler fail on a pattern match.
  %{data: %{audio: nil}} ->
    Kino.Frame.render(
      frame,
      Kino.Text.new("Please record or upload an audio clip before submitting.")
    )

  %{data: %{prompt: prompt, audio: %{file_ref: file_ref}}} ->
    Kino.Frame.clear(frame)

    # Resolve the uploaded clip to a path on disk that Gemini.chat_streaming/2 can read.
    file_path = Kino.Input.file_path(file_ref)

    # Append each streamed piece of text to the frame as soon as it arrives.
    prompt
    |> Gemini.chat_streaming(file_path)
    |> Stream.each(&Kino.Frame.append(frame, Kino.Text.new(&1)))
    |> Stream.run()
end)

Kino.Layout.grid([form, frame])
defmodule Gemini do
  @moduledoc """
  Minimal streaming client for the Gemini `streamGenerateContent` endpoint.

  Sends a text prompt together with a WAV audio file and exposes the
  server-sent response as a lazy stream of text fragments.
  """

  # Maximum time (in ms) to wait for the next chunk before the stream halts.
  @chunk_timeout 15_000

  # Key under which the unfinished SSE line is kept between `into:` callbacks.
  @buffer_key :gemini_sse_buffer

  @doc """
  Returns a lazy stream of text fragments generated for `prompt` and the audio
  file at `file_path`.

  The file is read and Base64-encoded, and the `LB_GEMINI_API_KEY` environment
  variable is looked up, when this function is called. The HTTP request itself
  starts only once the stream is enumerated, in a task owned by the
  enumerating process.

  The stream halts when the response has been fully received or when no chunk
  arrives for #{@chunk_timeout} ms. The request task is shut down whenever the
  stream halts or is abandoned.

  Raises if the file cannot be read or if `LB_GEMINI_API_KEY` is not set.
  """
  @spec chat_streaming(String.t(), Path.t()) :: Enumerable.t()
  def chat_streaming(prompt, file_path) do
    base64 = file_path |> File.read!() |> Base.encode64()

    # Fail loudly on a missing key instead of sending a request with `key=`.
    gemini_api_key = System.fetch_env!("LB_GEMINI_API_KEY")

    Stream.resource(
      fn -> start_request(prompt, base64, gemini_api_key) end,
      &next_chunk/1,
      &stop_request/1
    )
  end

  @doc """
  Extracts the concatenated text of the first candidate in a decoded response
  event.

  Parts without a `"text"` key contribute nothing. Returns `""` when the event
  has no candidates or the first candidate carries no content parts.
  """
  @spec extract_text(map()) :: String.t()
  def extract_text(%{"candidates" => [%{"content" => %{"parts" => parts}} | _]})
      when is_list(parts) do
    Enum.map_join(parts, & &1["text"])
  end

  def extract_text(map) when is_map(map), do: ""

  # Starts the streaming request in a task linked to the calling process and
  # returns `{task, tag}`; `tag` identifies the messages of this request.
  defp start_request(prompt, base64, gemini_api_key) do
    # Captured here (not in chat_streaming/2) so chunks are delivered to the
    # process that actually enumerates the stream.
    caller = self()
    tag = make_ref()

    task =
      Task.async(fn ->
        Req.post!(
          "https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-pro-latest:streamGenerateContent?key=#{gemini_api_key}&alt=sse",
          receive_timeout: :infinity,
          headers: [
            {"content-type", "application/json"}
          ],
          json: %{
            contents: [
              %{
                role: "user",
                parts: [
                  %{text: prompt},
                  %{inlineData: %{mimeType: "audio/wav", data: base64}}
                ]
              }
            ]
          },
          into: fn {:data, data}, {req, resp} ->
            # A data chunk may end in the middle of a line, so only complete
            # lines are decoded; the remainder waits for the next chunk.
            buffered = Req.Request.get_private(req, @buffer_key, "") <> data
            {lines, rest} = split_complete_lines(buffered)

            for text <- decode_events(lines) do
              send(caller, {:gemini_chunk, tag, text})
            end

            {:cont, {Req.Request.put_private(req, @buffer_key, rest), resp}}
          end
        )

        send(caller, {:gemini_done, tag})
      end)

    {task, tag}
  end

  # Emits the next text fragment, or halts on completion or idle timeout.
  # Only messages carrying this request's tag are consumed.
  defp next_chunk({_task, tag} = state) do
    receive do
      {:gemini_done, ^tag} -> {:halt, state}
      {:gemini_chunk, ^tag, text} -> {[text], state}
    after
      @chunk_timeout -> {:halt, state}
    end
  end

  # Stops the request task (a no-op if it already finished) and removes any of
  # its messages still sitting in the mailbox.
  defp stop_request({task, tag}) do
    Task.shutdown(task, :brutal_kill)
    drain(tag)
  end

  # Discards pending messages that belong to the request identified by `tag`.
  defp drain(tag) do
    receive do
      {:gemini_chunk, ^tag, _text} -> drain(tag)
      {:gemini_done, ^tag} -> drain(tag)
    after
      0 -> :ok
    end
  end

  # Splits `buffer` into `{complete_lines, unfinished_tail}`. The tail is `""`
  # when the buffer ends with a newline.
  defp split_complete_lines(buffer) do
    {tail, complete} = buffer |> String.split("\n") |> List.pop_at(-1)
    {complete, tail}
  end

  # Decodes every `data: {...}` line into its text; other lines are skipped.
  defp decode_events(lines) do
    for "data: {" <> _ = line <- lines do
      line
      |> String.replace_prefix("data: ", "")
      |> Jason.decode!()
      |> extract_text()
    end
  end
end