# File listing metadata: 83 lines, 2.2 KiB, YAML
# Compose project for a single `llama-server` service, plus CasaOS
# (`x-casaos`) metadata describing its settings and its app-store listing.
name: llama-server

services:
  llama-server:
    # Image is pinned by both tag and sha256 digest.
    image: ghcr.io/ggml-org/llama.cpp:server-b8840@sha256:99d2554c4c8d5339649dde530056cf10771823d7cd983dbd0441da9c419976b1
    container_name: llama-server
    restart: unless-stopped
    # NOTE(review): upstream llama.cpp documents LLAMA_ARG_* environment
    # variables (e.g. LLAMA_ARG_MODEL, LLAMA_ARG_CTX_SIZE) for its server;
    # confirm this image actually reads the unprefixed names used below.
    environment:
      TZ: Europe/Stockholm
      MODEL: llama-3.2-3b-q4_k_m.gguf
      CTX_SIZE: "2048"
      # NOTE(review): described below as "0 = auto-detect"; verify against the
      # server's own thread-count default.
      N_THREADS: "0"
      HOST: "0.0.0.0"
      PORT: "8080"
      MAX_TOKENS: "512"
    ports:
      - target: 8080
        published: "8080"
        protocol: tcp
    # `$AppID` is left literal here; presumably substituted by CasaOS at
    # install time - TODO confirm.
    volumes:
      - type: bind
        source: /DATA/AppData/$AppID/models
        target: /models
      # NOTE(review): nothing in this file directs server output to /logs;
      # confirm the container writes there.
      - type: bind
        source: /DATA/AppData/$AppID/logs
        target: /logs
    deploy:
      resources:
        # A memory reservation, not a limit (no `limits` key is set).
        reservations:
          memory: 8G
    # Hardening: forbid gaining new privileges and drop all capabilities.
    security_opt:
      - no-new-privileges:true
    cap_drop:
      - ALL
    # Service-level CasaOS metadata: a description for each env var, port and
    # volume, keyed by its container-side name.
    x-casaos:
      envs:
        - container: MODEL
          description:
            en_us: >-
              Model filename inside /models (e.g. llama-3.2-3b-q4_k_m.gguf).
              Download GGUF files manually into /models.
        - container: CTX_SIZE
          description:
            en_us: Context window size in tokens
        - container: N_THREADS
          description:
            en_us: CPU threads (0 = auto-detect all cores)
        - container: MAX_TOKENS
          description:
            en_us: Maximum tokens to generate per response
        - container: TZ
          description:
            en_us: Timezone, for example Europe/Stockholm
      ports:
        - container: "8080"
          description:
            en_us: llama.cpp REST API port
      volumes:
        - container: /models
          description:
            en_us: Model GGUF files directory
        - container: /logs
          description:
            en_us: Server log output

# App-level CasaOS metadata (store listing); `main` names the service above.
x-casaos:
  architectures:
    - amd64
    - arm64
  main: llama-server
  category: ai
  author: Joachim Friberg
  developer: Joachim Friberg
  icon: https://cdn.simpleicons.org/llama
  tagline:
    en_us: CPU-only LLM inference server with REST API
  # NOTE(review): the text mentions AVX2/AVX512 while arm64 is also listed
  # under `architectures`; confirm the wording.
  description:
    en_us: >
      Local LLM inference server using llama.cpp. Serves GGUF models via OpenAI-compatible REST API.
      CPU-only with AVX2/AVX512 optimization. Requires manual model download.
  title:
    en_us: Llama Server
  index: /
  port_map: "8080"