---
# CasaOS app definition (Docker Compose) for a llama.cpp HTTP inference server.
#
# The image's entrypoint is the llama-server binary, which takes its settings
# from LLAMA_ARG_* environment variables (see the llama.cpp server README).
# Unprefixed names such as MODEL, CTX_SIZE or MAX_TOKENS are not read by it,
# so every tunable below uses the LLAMA_ARG_* spelling.
name: llama-server

services:
  llama-server:
    # Pinned by tag and digest so the deployed build is reproducible.
    image: ghcr.io/ggml-org/llama.cpp:server-b8840@sha256:99d2554c4c8d5339649dde530056cf10771823d7cd983dbd0441da9c419976b1
    container_name: llama-server
    restart: unless-stopped
    environment:
      TZ: Europe/Stockholm
      # Path of the GGUF file inside the container; /models is the bind
      # mount declared under volumes. A bare filename would not resolve.
      LLAMA_ARG_MODEL: /models/llama-3.2-3b-q4_k_m.gguf
      # Numeric values are quoted so they reach the container as strings.
      LLAMA_ARG_CTX_SIZE: "2048"
      # NOTE(review): 0 is expected to fall back to the detected core count;
      # confirm against the pinned build.
      LLAMA_ARG_THREADS: "0"
      # Listen on all interfaces so the published port is reachable.
      LLAMA_ARG_HOST: 0.0.0.0
      LLAMA_ARG_PORT: "8080"
      LLAMA_ARG_N_PREDICT: "512"
    ports:
      - target: 8080
        published: "8080"
        protocol: tcp
    volumes:
      - type: bind
        source: /DATA/AppData/$AppID/models
        target: /models
      # NOTE(review): nothing in this file points the server's log output
      # at /logs; confirm whether this mount is actually used.
      - type: bind
        source: /DATA/AppData/$AppID/logs
        target: /logs
    deploy:
      resources:
        reservations:
          memory: 8G
    # Hardening: no privilege escalation and no Linux capabilities. The
    # server listens on an unprivileged port, so none are required for bind.
    security_opt:
      - no-new-privileges:true
    cap_drop:
      - ALL
    # Per-service CasaOS UI hints; `container` values must match the
    # environment names, container port and mount targets declared above.
    x-casaos:
      envs:
        - container: LLAMA_ARG_MODEL
          description:
            en_us: Path of the GGUF model inside the container (e.g. /models/llama-3.2-3b-q4_k_m.gguf). Download GGUF files manually into /models.
        - container: LLAMA_ARG_CTX_SIZE
          description:
            en_us: Context window size in tokens
        - container: LLAMA_ARG_THREADS
          description:
            en_us: CPU threads (0 = auto-detect all cores)
        - container: LLAMA_ARG_N_PREDICT
          description:
            en_us: Maximum tokens to generate per response
        - container: TZ
          description:
            en_us: Timezone, for example Europe/Stockholm
      ports:
        - container: "8080"
          description:
            en_us: llama.cpp REST API port
      volumes:
        - container: /models
          description:
            en_us: Model GGUF files directory
        - container: /logs
          description:
            en_us: Server log output

# App-level CasaOS store metadata.
x-casaos:
  architectures:
    - amd64
    - arm64
  main: llama-server
  category: phirna
  author: Joachim Friberg
  developer: Joachim Friberg
  icon: https://cdn.simpleicons.org/llama
  tagline:
    en_us: CPU-only LLM inference server with REST API
  description:
    en_us: >
      Local LLM inference server using llama.cpp. Serves GGUF models via
      OpenAI-compatible REST API. CPU-only with AVX2/AVX512 optimization.
      Requires manual model download.
  title:
    en_us: Llama Server
  index: /
  port_map: "8080"