# Compose project name; the service key under `services` and the top-level
# x-casaos `main` entry in this file use the same identifier.
name: llama-server

services:
  # llama.cpp HTTP server, pinned by both tag and digest for reproducibility.
  llama-server:
    image: ghcr.io/ggml-org/llama.cpp:server-b8840@sha256:99d2554c4c8d5339649dde530056cf10771823d7cd983dbd0441da9c419976b1
    container_name: llama-server
    restart: unless-stopped
    # The image's entrypoint is the llama-server binary itself (no wrapper
    # script). It only reads LLAMA_ARG_* / LLAMA_LOG_* environment variables;
    # generic names such as MODEL, CTX_SIZE or MAX_TOKENS are silently ignored,
    # so every tunable below uses the name llama-server actually consumes.
    environment:
      TZ: Europe/Stockholm
      # Full in-container path: llama-server does not prepend /models itself.
      LLAMA_ARG_MODEL: /models/llama-3.2-3b-q4_k_m.gguf
      LLAMA_ARG_CTX_SIZE: "2048"
      # Values <= 0 make llama-server fall back to the detected core count.
      LLAMA_ARG_THREADS: "0"
      # Listen on all interfaces so the published port is reachable.
      LLAMA_ARG_HOST: 0.0.0.0
      LLAMA_ARG_PORT: "8080"
      # Upper bound on tokens generated per request.
      LLAMA_ARG_N_PREDICT: "512"
      # Write the server log into the /logs bind mount declared below;
      # without this nothing is ever written to that volume.
      LLAMA_LOG_FILE: /logs/llama-server.log
    ports:
      - target: 8080
        published: "8080"
        protocol: tcp
    volumes:
      # $AppID is left for the CasaOS installer to substitute.
      - type: bind
        source: /DATA/AppData/$AppID/models
        target: /models
      - type: bind
        source: /DATA/AppData/$AppID/logs
        target: /logs
    deploy:
      resources:
        reservations:
          memory: 8G
    # Hardening: the server needs no extra privileges or capabilities.
    security_opt:
      - no-new-privileges:true
    cap_drop:
      - ALL
    # Per-service metadata shown in the CasaOS settings UI. The `container`
    # values must match the environment keys, ports and mount targets above.
    x-casaos:
      envs:
        - container: LLAMA_ARG_MODEL
          description:
            en_us: >-
              Path of the GGUF model inside the container
              (e.g. /models/llama-3.2-3b-q4_k_m.gguf).
              Download GGUF files manually into /models.
        - container: LLAMA_ARG_CTX_SIZE
          description:
            en_us: Context window size in tokens
        - container: LLAMA_ARG_THREADS
          description:
            en_us: CPU threads (0 = auto-detect all cores)
        - container: LLAMA_ARG_N_PREDICT
          description:
            en_us: Maximum tokens to generate per response
        - container: TZ
          description:
            en_us: Timezone, for example Europe/Stockholm
      ports:
        - container: "8080"
          description:
            en_us: llama.cpp REST API port
      volumes:
        - container: /models
          description:
            en_us: Model GGUF files directory
        - container: /logs
          description:
            en_us: Server log output

# App-level metadata for the CasaOS app store / dashboard.
x-casaos:
  # Store listing text.
  title:
    en_us: Llama Server
  tagline:
    en_us: CPU-only LLM inference server with REST API
  # NOTE(review): the text mentions AVX2/AVX512 (x86 instruction sets) while
  # arm64 is also listed under `architectures` - confirm the wording.
  description:
    en_us: >
      Local LLM inference server using llama.cpp. Serves GGUF models via
      OpenAI-compatible REST API. CPU-only with AVX2/AVX512 optimization.
      Requires manual model download.
  # NOTE(review): verify that this icon URL actually resolves.
  icon: 'https://cdn.simpleicons.org/llama'
  category: 'phirna'
  author: 'Joachim Friberg'
  developer: 'Joachim Friberg'

  # Which service is the app's main container, and where its web entry lives.
  main: 'llama-server'
  architectures: [amd64, arm64]
  index: '/'
  port_map: '8080'