Add llama-server and open-webui apps for local LLM inference

- llama-server: llama.cpp REST API server, 8G memory, port 8080
- open-webui: Chat UI connecting to llama-server, 2G memory, port 3000
- Both include x-casaos metadata for ZimaOS app store
- README with model download instructions and API examples
2026-04-19 22:25:22 +02:00
parent 231aba08b0
commit 0aabfc8a72
4 changed files with 309 additions and 0 deletions
@@ -0,0 +1,82 @@
+name: llama-server
+# NOTE: the llama.cpp server is configured through LLAMA_ARG_*-prefixed variables; unprefixed names are ignored.
+services:
+  llama-server:
+    image: ghcr.io/ggml-org/llama.cpp:server  # upstream docs reference the ggml-org namespace
+    container_name: llama-server
+    restart: unless-stopped
+    environment:
+      TZ: Europe/Stockholm
+      LLAMA_ARG_MODEL: /models/llama-3.2-3b-q4_k_m.gguf  # full in-container path, not a bare filename
+      LLAMA_ARG_CTX_SIZE: "2048"
+      LLAMA_ARG_THREADS: "0"
+      LLAMA_ARG_HOST: 0.0.0.0  # listen on every interface so the published port is reachable
+      LLAMA_ARG_PORT: "8080"  # keep in sync with ports.target below
+      LLAMA_ARG_N_PREDICT: "512"
+    ports:
+      - target: 8080
+        published: "8080"
+        protocol: tcp
+    volumes:
+      - type: bind
+        source: /DATA/AppData/$AppID/models
+        target: /models
+      - type: bind
+        source: /DATA/AppData/$AppID/logs
+        target: /logs  # NOTE(review): nothing in this file enables file logging; confirm this mount is used
+    deploy:
+      resources:
+        reservations:
+          memory: 8G  # a reservation, not a hard limit
+    security_opt:
+      - no-new-privileges:true
+    cap_drop:
+      - ALL
+    x-casaos:
+      envs:
+        - container: LLAMA_ARG_MODEL
+          description:
+            en_us: Path to the GGUF model inside the container (e.g. /models/llama-3.2-3b-q4_k_m.gguf). Download GGUF files manually into /models.
+        - container: LLAMA_ARG_CTX_SIZE
+          description:
+            en_us: Context window size in tokens
+        - container: LLAMA_ARG_THREADS
+          description:
+            en_us: CPU threads (0 = auto-detect all cores)
+        - container: LLAMA_ARG_N_PREDICT
+          description:
+            en_us: Maximum tokens to generate per response
+        - container: TZ
+          description:
+            en_us: Timezone, for example Europe/Stockholm
+      ports:
+        - container: "8080"
+          description:
+            en_us: llama.cpp REST API port
+      volumes:
+        - container: /models
+          description:
+            en_us: Model GGUF files directory
+        - container: /logs
+          description:
+            en_us: Server log output
+
+# App-store listing metadata for the compose app as a whole.
+x-casaos:
+  title:
+    en_us: Llama Server
+  tagline:
+    en_us: CPU-only LLM inference server with REST API
+  description:
+    en_us: >
+      Local LLM inference server using llama.cpp.
+      Serves GGUF models via OpenAI-compatible REST API.
+      CPU-only with AVX2/AVX512 optimization. Requires manual model download.
+  icon: https://cdn.simpleicons.org/llama
+  category: ai
+  author: Joachim Friberg
+  developer: Joachim Friberg
+  architectures: [amd64, arm64]
+  main: llama-server
+  index: /
+  port_map: "8080"