zhengr committed
Commit df054a5
1 parent: 553f5a5
Files changed (3):
  1. Dockerfile (+1, -0)
  2. config.yaml (+13, -0)
  3. entrypoint.sh (+12, -11)
Dockerfile CHANGED
@@ -15,6 +15,7 @@ RUN chmod -R 777 /.ollama
 WORKDIR /.ollama

 # Copy the entry point script
+COPY config.yaml /config.yaml
 COPY entrypoint.sh /entrypoint.sh
 RUN chmod +x /entrypoint.sh

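The added COPY stages the LiteLLM config at the image root, where the updated entrypoint.sh expects it (`litellm --config /config.yaml`). A minimal local smoke test, assuming a hypothetical image tag and LiteLLM's default proxy port (4000 in recent releases):

    # "ollama-litellm" is only an example tag, not part of this repo
    docker build -t ollama-litellm .
    docker run -p 4000:4000 ollama-litellm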
config.yaml ADDED
@@ -0,0 +1,13 @@
+model_list:
+  - model_name: gollama-mistral-7b
+    litellm_params:
+      model: ollama/mistral:7b
+      api_base: https://zhengr-ollama.hf.space
+
+litellm_settings: # module-level litellm settings - https://github.com/BerriAI/litellm/blob/main/litellm/__init__.py
+  drop_params: True
+  success_callback: ["langfuse"] # OPTIONAL - if you want to start sending LLM logs to Langfuse. Make sure to set `LANGFUSE_PUBLIC_KEY` and `LANGFUSE_SECRET_KEY` in your env
+
+general_settings:
+  master_key: sk-1234 # [OPTIONAL] Only use this if you want to require all calls to contain this key (Authorization: Bearer sk-1234)
+  alerting: ["slack"] # [OPTIONAL] If you want Slack alerts for hanging LLM requests, slow LLM responses, or budget alerts. Make sure to set `SLACK_WEBHOOK_URL` in your env
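With this config, the LiteLLM proxy exposes an OpenAI-compatible API: requests for the alias gollama-mistral-7b are routed to ollama/mistral:7b at the api_base above, and master_key requires every call to send that key as a bearer token. A minimal sketch of a test request, assuming the proxy listens on LiteLLM's default port (4000):

    curl http://localhost:4000/v1/chat/completions \
      -H "Content-Type: application/json" \
      -H "Authorization: Bearer sk-1234" \
      -d '{"model": "gollama-mistral-7b", "messages": [{"role": "user", "content": "Hello"}]}'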
entrypoint.sh CHANGED
@@ -2,21 +2,22 @@

 # Starting server
 echo "Starting server"
-ollama serve &
-sleep 1
+#ollama serve &
+#sleep 1

 # Splitting the models by comma and pulling each
-IFS=',' read -ra MODELS <<< "$model"
-for m in "${MODELS[@]}"; do
-echo "Pulling $m"
-ollama pull "$m"
-sleep 5
-echo "Running $m"
-ollama run "$m" --keepalive -1s
+# IFS=',' read -ra MODELS <<< "$model"
+# for m in "${MODELS[@]}"; do
+# echo "Pulling $m"
+# ollama pull "$m"
+# sleep 5
+# echo "Running $m"
+# ollama run "$m" --keepalive -1s
 # No need to sleep here unless you want to give some delay between each pull for some reason
-done
+# done

-litellm --model ollama/"$m" --drop_params
+#litellm --model ollama/"$m" --drop_params
+litellm --config /config.yaml

 # Keep the script running to prevent the container from exiting
 wait
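Net effect: the script no longer starts a local `ollama serve` or pulls models; it only launches the LiteLLM proxy, which forwards requests to the remote api_base from config.yaml. Since litellm now runs in the foreground, the trailing wait is effectively a no-op safeguard. One way to check that the container came up, assuming the default port and the master key above:

    # LiteLLM's health endpoint; expects the master key when one is configured
    curl http://localhost:4000/health -H "Authorization: Bearer sk-1234"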