Files
zima-apps/Apps/llama-server/docker-compose.yaml
T
2026-04-21 21:03:18 +02:00

83 lines
2.2 KiB
YAML

name: llama-server
services:
  # llama.cpp HTTP inference server, pinned by tag and digest.
  llama-server:
    image: ghcr.io/ggml-org/llama.cpp:server-b8840@sha256:99d2554c4c8d5339649dde530056cf10771823d7cd983dbd0441da9c419976b1
    container_name: llama-server
    restart: unless-stopped
    environment:
      TZ: Europe/Stockholm
      # The image's entrypoint is the llama-server binary itself; it is
      # configured via CLI flags or LLAMA_ARG_* environment variables.
      # Unprefixed names (MODEL, CTX_SIZE, ...) are not read by the server,
      # so every setting below uses the documented LLAMA_ARG_* name.
      #
      # Full path to the GGUF file inside the container (see /models volume).
      LLAMA_ARG_MODEL: /models/llama-3.2-3b-q4_k_m.gguf
      # Context window size in tokens.
      LLAMA_ARG_CTX_SIZE: "2048"
      # CPU threads for generation; values <= 0 select all hardware threads.
      LLAMA_ARG_THREADS: "0"
      # Listen on all interfaces so the published port is reachable.
      LLAMA_ARG_HOST: "0.0.0.0"
      # Must match the `target` port published below.
      LLAMA_ARG_PORT: "8080"
      # Maximum number of tokens to generate per response.
      LLAMA_ARG_N_PREDICT: "512"
    ports:
      - target: 8080
        published: "8080"
        protocol: tcp
    volumes:
      # GGUF model files; must contain the file named in LLAMA_ARG_MODEL.
      - type: bind
        source: /DATA/AppData/$AppID/models
        target: /models
      # NOTE(review): nothing in this file directs server output to /logs;
      # confirm whether a log-file setting is intended.
      - type: bind
        source: /DATA/AppData/$AppID/logs
        target: /logs
    deploy:
      resources:
        reservations:
          memory: 8G
    # Hardening: forbid privilege escalation and drop all Linux capabilities.
    security_opt:
      - no-new-privileges:true
    cap_drop:
      - ALL
    # Per-service CasaOS UI metadata: descriptions for the settings above.
    x-casaos:
      envs:
        - container: LLAMA_ARG_MODEL
          description:
            en_us: >-
              Path to the GGUF model inside the container
              (e.g. /models/llama-3.2-3b-q4_k_m.gguf).
              Download GGUF files manually into /models.
        - container: LLAMA_ARG_CTX_SIZE
          description:
            en_us: Context window size in tokens
        - container: LLAMA_ARG_THREADS
          description:
            en_us: CPU threads (0 = auto-detect all cores)
        - container: LLAMA_ARG_N_PREDICT
          description:
            en_us: Maximum tokens to generate per response
        - container: TZ
          description:
            en_us: Timezone, for example Europe/Stockholm
      ports:
        - container: "8080"
          description:
            en_us: llama.cpp REST API port
      volumes:
        - container: /models
          description:
            en_us: Model GGUF files directory
        - container: /logs
          description:
            en_us: Server log output
# App-level CasaOS metadata for this compose project.
x-casaos:
  # Display texts.
  title:
    en_us: Llama Server
  tagline:
    en_us: CPU-only LLM inference server with REST API
  description:
    en_us: >
      Local LLM inference server using llama.cpp. Serves GGUF models via
      OpenAI-compatible REST API. CPU-only with AVX2/AVX512 optimization.
      Requires manual model download.
  icon: https://cdn.simpleicons.org/llama

  # Attribution and catalogue placement.
  author: Joachim Friberg
  developer: Joachim Friberg
  category: phirna

  # Supported CPU architectures.
  architectures:
    - amd64
    - arm64

  # `main` names the service key defined under `services`; `port_map` and
  # `index` match the published port (8080) and the root path.
  main: llama-server
  port_map: '8080'
  index: /