Deploying LLM Applications to Production
Everything you need to know about taking an LLM-powered application from prototype to production safely.
The gap between a working LLM demo and a production LLM application is enormous. Demos can tolerate latency, ignore errors, and shrug off cost. Production demands that you handle all three.
The Production Checklist
Before deploying any LLM feature, verify:
- Rate limiting — Can you handle API rate limits gracefully?
- Fallbacks — What happens when the LLM API is down?
- Cost controls — Do you have spend alerts and hard caps?
- Latency budgets — Are timeouts configured? Streaming enabled?
- Content filtering — Are you filtering harmful outputs?
- Observability — Can you debug failures after the fact?
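Most of these checks compose into a single request path. A minimal sketch of that idea, assuming the guard modules built in the sections below (`LLM.CostGuard`, `LLM.RateLimiter`, `LLM.WithFallback`); the `LLM.Gateway` module name is just an illustration:

```elixir
defmodule LLM.Gateway do
  # Funnel every LLM call through one entry point so the checklist
  # is enforced in code, not by convention.
  def complete(prompt, opts \\ []) do
    with :ok <- LLM.CostGuard.check_budget(),
         :ok <- LLM.RateLimiter.acquire() do
      LLM.WithFallback.complete(prompt, opts)
    else
      {:error, reason} -> {:error, reason}
    end
  end
end
```

Centralizing the call path also means new checks (content filtering, per-user quotas) get added in exactly one place.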
Rate Limiting and Backpressure
LLM APIs have rate limits. Respect them or get throttled:
```elixir
defmodule LLM.RateLimiter do
  use GenServer

  # Fixed-window limiter: counts requests and resets the window every minute.
  def start_link(opts) do
    max_rpm = Keyword.get(opts, :max_rpm, 1000)
    GenServer.start_link(__MODULE__, %{max_rpm: max_rpm, count: 0}, name: __MODULE__)
  end

  @impl true
  def init(state) do
    # Arm the first window reset here; handle_info/2 re-arms it.
    # (Scheduling the reset from handle_call would stack a timer per
    # rejected request.)
    Process.send_after(self(), :reset, 60_000)
    {:ok, state}
  end

  def acquire do
    GenServer.call(__MODULE__, :acquire, 30_000)
  end

  @impl true
  def handle_call(:acquire, _from, %{count: count, max_rpm: max} = state) when count >= max do
    {:reply, {:error, :rate_limited}, state}
  end

  def handle_call(:acquire, _from, %{count: count} = state) do
    {:reply, :ok, %{state | count: count + 1}}
  end

  @impl true
  def handle_info(:reset, state) do
    # Start a fresh one-minute window.
    Process.send_after(self(), :reset, 60_000)
    {:noreply, %{state | count: 0}}
  end
end
```
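Callers still need a policy for the `{:error, :rate_limited}` case. One option is a bounded retry with exponential backoff; a sketch (the delays and attempt count are arbitrary):

```elixir
defmodule LLM.Retry do
  # Retry `fun` when the limiter rejects, sleeping 500ms, 1s, 2s, ...
  # between attempts, up to `max_attempts` tries.
  def with_backoff(fun, max_attempts \\ 3, attempt \\ 1) do
    case LLM.RateLimiter.acquire() do
      :ok ->
        fun.()

      {:error, :rate_limited} when attempt < max_attempts ->
        Process.sleep(500 * Integer.pow(2, attempt - 1))
        with_backoff(fun, max_attempts, attempt + 1)

      {:error, :rate_limited} = error ->
        error
    end
  end
end
```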
Fallback Strategies
Never depend on a single LLM provider:
```elixir
defmodule LLM.WithFallback do
  # Providers are tried in order; the first success wins.
  @providers [
    {LLM.Anthropic, "claude-sonnet-4-6"},
    {LLM.OpenAI, "gpt-4o"},
    {LLM.Local, "llama-3-70b"}
  ]

  def complete(prompt, opts \\ []) do
    Enum.reduce_while(@providers, {:error, :all_failed}, fn {provider, model}, _acc ->
      case provider.complete(prompt, Keyword.put(opts, :model, model)) do
        {:ok, response} -> {:halt, {:ok, response}}
        {:error, _reason} -> {:cont, {:error, :all_failed}}
      end
    end)
  end
end
```
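Silent fallbacks hide outages: if the primary provider fails for a week and the fallback quietly absorbs the traffic, you find out from the invoice. One way to make fallback usage visible is to emit a telemetry event for each provider failure; a sketch, where the event name and wrapper module are assumptions:

```elixir
defmodule LLM.ProviderTry do
  # Wraps a single provider call and records failures, so dashboards
  # can show which providers are being skipped and why.
  def complete(provider, model, prompt, opts) do
    case provider.complete(prompt, Keyword.put(opts, :model, model)) do
      {:ok, _} = ok ->
        ok

      {:error, reason} = error ->
        :telemetry.execute(
          [:llm, :provider, :failure],
          %{count: 1},
          %{provider: provider, model: model, reason: reason}
        )

        error
    end
  end
end
```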
Observability
Log everything. You’ll need it when debugging that weird edge case at 3am:
```elixir
defmodule LLM.Logger do
  require Logger

  def complete(prompt, opts \\ []) do
    start_time = System.monotonic_time(:millisecond)
    request_id = Ecto.UUID.generate()
    Logger.metadata(llm_request_id: request_id)

    result = LLM.API.complete(prompt, opts)
    duration = System.monotonic_time(:millisecond) - start_time

    log_entry = %{
      request_id: request_id,
      model: Keyword.get(opts, :model),
      prompt_tokens: count_tokens(prompt),
      duration_ms: duration,
      status: if(match?({:ok, _}, result), do: :success, else: :error)
    }

    :telemetry.execute([:llm, :complete], %{duration: duration}, log_entry)
    Logger.info("LLM call completed", Map.to_list(log_entry))
    result
  end

  # Rough estimate (~4 characters per token); swap in a real tokenizer
  # if you need billing-grade accuracy.
  defp count_tokens(text), do: div(String.length(text), 4)
end
```
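The `[:llm, :complete]` event is only useful once something listens for it. A minimal handler that attaches at application start and forwards durations to a sink (plain `Logger` here as a stand-in for StatsD, Prometheus, or similar; the handler id is arbitrary):

```elixir
defmodule LLM.Telemetry do
  require Logger

  # Call once at boot, e.g. from Application.start/2.
  def attach do
    :telemetry.attach(
      "llm-complete-handler",
      [:llm, :complete],
      &__MODULE__.handle_event/4,
      nil
    )
  end

  def handle_event([:llm, :complete], %{duration: duration}, metadata, _config) do
    # Replace with a real metrics backend in production.
    Logger.info("llm.complete duration=#{duration}ms model=#{inspect(metadata.model)}")
  end
end
```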
Cost Controls
Set hard limits to prevent runaway spending:
```elixir
defmodule LLM.CostGuard do
  # Repo and LLMLog are your app's Ecto repo and LLM call-log schema.
  import Ecto.Query

  @daily_budget_cents 5000 # $50/day

  def check_budget do
    today = Date.utc_today()

    spent =
      Repo.one(
        from l in LLMLog,
          where: fragment("DATE(?)", l.inserted_at) == ^today,
          select: sum(l.cost_cents)
      ) || 0

    if spent >= @daily_budget_cents do
      {:error, :budget_exceeded}
    else
      :ok
    end
  end
end
```
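A hard cap alone fails closed with no warning: the feature simply stops working mid-afternoon. A variant that also alerts at a soft threshold gives you time to react (the 80% threshold is arbitrary, and `Logger` stands in for your real paging channel):

```elixir
defmodule LLM.CostGuard.Alerting do
  require Logger

  @daily_budget_cents 5000
  @alert_threshold 0.8

  # Takes today's spend as an argument; pair with the query above.
  def check_budget(spent_cents) do
    cond do
      spent_cents >= @daily_budget_cents ->
        {:error, :budget_exceeded}

      spent_cents >= trunc(@daily_budget_cents * @alert_threshold) ->
        # Swap for PagerDuty/Slack in a real deployment.
        Logger.warning("LLM spend at #{spent_cents}/#{@daily_budget_cents} cents today")
        :ok

      true ->
        :ok
    end
  end
end
```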
Streaming for UX
For user-facing features, stream responses to reduce perceived latency:
```elixir
defmodule LLM.Stream do
  def stream_to_liveview(prompt, _socket) do
    # Capture the LiveView's pid before spawning: messages sent to it
    # arrive in the LiveView's handle_info/2. (Sending to
    # socket.transport_pid targets the websocket transport process
    # instead, and handle_info never fires.)
    lv_pid = self()

    Task.start(fn ->
      LLM.API.stream(prompt, fn
        {:chunk, text} -> send(lv_pid, {:llm_chunk, text})
        :done -> send(lv_pid, :llm_done)
      end)
    end)
  end
end
```
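On the LiveView side, the matching `handle_info/2` clauses append each chunk to an assign as it arrives. A sketch, assuming a `:response` assign rendered in the template and a `:streaming` flag for the UI:

```elixir
# In the LiveView module:
def handle_info({:llm_chunk, text}, socket) do
  {:noreply, update(socket, :response, &(&1 <> text))}
end

def handle_info(:llm_done, socket) do
  {:noreply, assign(socket, :streaming, false)}
end
```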
The Golden Rule
Production LLM apps should fail gracefully, not silently. Every error should be logged, every fallback should be monitored, and every cost should be tracked. The LLM is a powerful but unreliable dependency — treat it with the same caution you’d give any external API.