-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathcompose-llms.yaml
44 lines (42 loc) · 965 Bytes
/
compose-llms.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
version: '3'
services:
  # Hugging Face Text Generation Inference server, exposed on host port 8080.
  tgi:
    image: ghcr.io/huggingface/text-generation-inference:latest
    container_name: tgi
    ports:
      # Quoted: unquoted digits-and-colons scalars (e.g. 8080:80) are parsed
      # as base-60 integers by YAML 1.1 parsers — always quote port mappings.
      - "8080:80"
    volumes:
      # Host directory for the model download cache; set LOCAL_MODEL_CACHE_DIR
      # in the environment / .env file.
      - ${LOCAL_MODEL_CACHE_DIR}:/model_cache
    environment:
      # Hub token for gated models (e.g. Llama); supplied via LLAMA_TOKEN.
      - HUGGING_FACE_HUB_TOKEN=${LLAMA_TOKEN}
    # need this to access GPU
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]
    # TGI launcher arguments; all values are env-interpolated at compose time.
    command:
      - '--huggingface-hub-cache'
      - '/model_cache'
      - '--model-id'
      - '${MODEL_ID}'
      - '--max-batch-prefill-tokens'
      - '${MAX_PREFILL_TOKENS}'
      - '--quantize'
      - '${QUANT}'
      - '--max-total-tokens'
      - '${MAX_TOTAL_TOKENS}'
      - '--max-input-length'
      - '${MAX_INPUT_LENGTH}'
    # Larger shared memory segment; TGI uses /dev/shm for inter-process work.
    shm_size: 1gb
  # Chat UI frontend, built locally from ./chat_ui/.
  ui:
    image: localllm-ui:latest
    container_name: ui
    build:
      context: ./chat_ui/
    ports:
      # Quoted for the same YAML sexagesimal reason as above.
      - "7000:7000"
  # api:
  # image: