Ollama + Open WebUI
Campus intranet access link: http://10.2.170.115:47408/
Install Ollama
curl -fsSL https://ollama.com/install.sh | sh
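To confirm the install script succeeded, a quick sanity check (a minimal step, not part of the original notes; ollama should now be on PATH):

# Print the installed version to verify the binary works
ollama --version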
Run Ollama

By default, ollama serve listens only on 127.0.0.1:11434. Export OLLAMA_HOST=0.0.0.0 first so it binds all interfaces and is reachable from other machines; the two runs in the log below show the difference:

export OLLAMA_HOST=0.0.0.0
ollama serve
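If the server should keep running after the terminal closes, one option is a background run with nohup (a generic shell pattern, not part of the original setup; the log file name is arbitrary):

# Bind all interfaces and keep serving after the shell exits
OLLAMA_HOST=0.0.0.0 nohup ollama serve > ollama.log 2>&1 &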
(base) root@c2e882693a43eed7fcb13a3df6d29cb7-taskrole1-0:~# ollama serve
time=2024-04-09T13:31:56.306+08:00 level=INFO source=images.go:804 msg="total blobs: 18"
time=2024-04-09T13:31:56.307+08:00 level=INFO source=images.go:811 msg="total unused blobs removed: 0"
time=2024-04-09T13:31:56.307+08:00 level=INFO source=routes.go:1118 msg="Listening on 127.0.0.1:11434 (version 0.1.30)"
time=2024-04-09T13:31:56.357+08:00 level=INFO source=payload_common.go:113 msg="Extracting dynamic libraries to /tmp/ollama1127788686/runners ..."
time=2024-04-09T13:31:59.767+08:00 level=INFO source=payload_common.go:140 msg="Dynamic LLM libraries [rocm_v60000 cuda_v11 cpu_avx2 cpu_avx cpu]"
time=2024-04-09T13:31:59.767+08:00 level=INFO source=gpu.go:115 msg="Detecting GPU type"
time=2024-04-09T13:31:59.767+08:00 level=INFO source=gpu.go:265 msg="Searching for GPU management library libcudart.so*"
time=2024-04-09T13:31:59.769+08:00 level=INFO source=gpu.go:311 msg="Discovered GPU libraries: [/tmp/ollama1127788686/runners/cuda_v11/libcudart.so.11.0 /usr/local/cuda/lib64/libcudart.so.12.1.55]"
time=2024-04-09T13:31:59.931+08:00 level=INFO source=gpu.go:120 msg="Nvidia GPU detected via cudart"
time=2024-04-09T13:31:59.931+08:00 level=INFO source=cpu_common.go:11 msg="CPU has AVX2"
time=2024-04-09T13:32:00.308+08:00 level=INFO source=gpu.go:188 msg="[cudart] CUDART CUDA Compute Capability detected: 8.6"
^C(base) root@c2e882693a43eed7fcb13a3df6d29cb7-taskrole1-0:~#
(base) root@c2e882693a43eed7fcb13a3df6d29cb7-taskrole1-0:~#
(base) root@c2e882693a43eed7fcb13a3df6d29cb7-taskrole1-0:~# export OLLAMA_HOST=0.0.0.0
(base) root@c2e882693a43eed7fcb13a3df6d29cb7-taskrole1-0:~# ollama serve
time=2024-04-09T13:34:34.936+08:00 level=INFO source=images.go:804 msg="total blobs: 18"
time=2024-04-09T13:34:34.937+08:00 level=INFO source=images.go:811 msg="total unused blobs removed: 0"
time=2024-04-09T13:34:34.937+08:00 level=INFO source=routes.go:1118 msg="Listening on [::]:11434 (version 0.1.30)"
time=2024-04-09T13:34:34.939+08:00 level=INFO source=payload_common.go:113 msg="Extracting dynamic libraries to /tmp/ollama1531663222/runners ..."
time=2024-04-09T13:34:38.387+08:00 level=INFO source=payload_common.go:140 msg="Dynamic LLM libraries [cpu_avx2 rocm_v60000 cuda_v11 cpu cpu_avx]"
time=2024-04-09T13:34:38.387+08:00 level=INFO source=gpu.go:115 msg="Detecting GPU type"
time=2024-04-09T13:34:38.387+08:00 level=INFO source=gpu.go:265 msg="Searching for GPU management library libcudart.so*"
time=2024-04-09T13:34:38.389+08:00 level=INFO source=gpu.go:311 msg="Discovered GPU libraries: [/tmp/ollama1531663222/runners/cuda_v11/libcudart.so.11.0 /usr/local/cuda/lib64/libcudart.so.12.1.55]"
time=2024-04-09T13:34:38.539+08:00 level=INFO source=gpu.go:120 msg="Nvidia GPU detected via cudart"
time=2024-04-09T13:34:38.539+08:00 level=INFO source=cpu_common.go:11 msg="CPU has AVX2"
time=2024-04-09T13:34:38.910+08:00 level=INFO source=gpu.go:188 msg="[cudart] CUDART CUDA Compute Capability detected: 8.6"
[GIN] 2024/04/09 - 13:34:42 | 200 | 100.456µs | 10.2.170.115 | GET "/"
[GIN] 2024/04/09 - 13:34:42 | 404 | 18.205µs | 10.2.170.115 | GET "/favicon.ico"
[GIN] 2024/04/09 - 13:34:44 | 200 | 32.386µs | 10.2.170.115 | GET "/"
[GIN] 2024/04/09 - 13:35:50 | 200 | 59.039µs | 127.0.0.1 | HEAD "/"
[GIN] 2024/04/09 - 13:35:50 | 200 | 1.774928ms | 127.0.0.1 | POST "/api/show"
[GIN] 2024/04/09 - 13:35:50 | 200 | 1.215644ms | 127.0.0.1 | POST "/api/show"
time=2024-04-09T13:35:50.620+08:00 level=INFO source=cpu_common.go:11 msg="CPU has AVX2"
time=2024-04-09T13:35:50.621+08:00 level=INFO source=gpu.go:188 msg="[cudart] CUDART CUDA Compute Capability detected: 8.6"
time=2024-04-09T13:35:50.621+08:00 level=INFO source=cpu_common.go:11 msg="CPU has AVX2"
time=2024-04-09T13:35:50.622+08:00 level=INFO source=gpu.go:188 msg="[cudart] CUDART CUDA Compute Capability detected: 8.6"
time=2024-04-09T13:35:50.622+08:00 level=INFO source=cpu_common.go:11 msg="CPU has AVX2"
loading library /tmp/ollama1531663222/runners/cuda_v11/libext_server.so
time=2024-04-09T13:35:51.108+08:00 level=INFO source=dyn_ext_server.go:87 msg="Loading Dynamic llm server: /tmp/ollama1531663222/runners/cuda_v11/libext_server.so"
time=2024-04-09T13:35:51.108+08:00 level=INFO source=dyn_ext_server.go:147 msg="Initializing llama server"
{"function":"load_model","level":"INFO","line":391,"msg":"Multi Modal Mode Enabled","tid":"140300055992064","timestamp":1712640951}
clip_model_load: model name: openai/clip-vit-large-patch14-336
clip_model_load: description: image encoder for LLaVA
clip_model_load: GGUF version: 3
clip_model_load: alignment: 32
clip_model_load: n_tensors: 377
clip_model_load: n_kv: 19
clip_model_load: ftype: f16
clip_model_load: loaded meta data with 19 key-value pairs and 377 tensors from /root/.ollama/models/blobs/sha256-72d6f08a42f656d36b356dbe0920675899a99ce21192fd66266fb7d82ed07539
clip_model_load: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
clip_model_load: - kv 0: general.architecture str = clip
clip_model_load: - kv 1: clip.has_text_encoder bool = false
clip_model_load: - kv 2: clip.has_vision_encoder bool = true
clip_model_load: - kv 3: clip.has_llava_projector bool = true
clip_model_load: - kv 4: general.file_type u32 = 1
clip_model_load: - kv 5: general.name str = openai/clip-vit-large-patch14-336
clip_model_load: - kv 6: general.description str = image encoder for LLaVA
clip_model_load: - kv 7: clip.projector_type str = mlp
clip_model_load: - kv 8: clip.vision.image_size u32 = 336
clip_model_load: - kv 9: clip.vision.patch_size u32 = 14
clip_model_load: - kv 10: clip.vision.embedding_length u32 = 1024
clip_model_load: - kv 11: clip.vision.feed_forward_length u32 = 4096
clip_model_load: - kv 12: clip.vision.projection_dim u32 = 768
clip_model_load: - kv 13: clip.vision.attention.head_count u32 = 16
clip_model_load: - kv 14: clip.vision.attention.layer_norm_epsilon f32 = 0.000010
clip_model_load: - kv 15: clip.vision.block_count u32 = 23
clip_model_load: - kv 16: clip.vision.image_mean arr[f32,3] = [0.481455, 0.457828, 0.408211]
clip_model_load: - kv 17: clip.vision.image_std arr[f32,3] = [0.268630, 0.261303, 0.275777]
clip_model_load: - kv 18: clip.use_gelu bool = false
clip_model_load: - type f32: 235 tensors
clip_model_load: - type f16: 142 tensors
ggml_cuda_init: GGML_CUDA_FORCE_MMQ: yes
ggml_cuda_init: CUDA_USE_TENSOR_CORES: no
ggml_cuda_init: found 1 CUDA devices:
Device 0: NVIDIA A40, compute capability 8.6, VMM: yes
clip_model_load: CLIP using CUDA backend
clip_model_load: text_encoder: 0
clip_model_load: vision_encoder: 1
clip_model_load: llava_projector: 1
clip_model_load: model size: 595.49 MB
clip_model_load: metadata size: 0.14 MB
clip_model_load: params backend buffer size = 595.49 MB (377 tensors)
key clip.vision.image_grid_pinpoints not found in file
key clip.vision.mm_patch_merge_type not found in file
key clip.vision.image_crop_resolution not found in file
clip_model_load: compute allocated memory: 32.89 MB
llama_model_loader: loaded meta data with 24 key-value pairs and 291 tensors from /root/.ollama/models/blobs/sha256-170370233dd5c5415250a2ecd5c71586352850729062ccef1496385647293868 (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv 0: general.architecture str = llama
llama_model_loader: - kv 1: general.name str = liuhaotian
llama_model_loader: - kv 2: llama.context_length u32 = 32768
llama_model_loader: - kv 3: llama.embedding_length u32 = 4096
llama_model_loader: - kv 4: llama.block_count u32 = 32
llama_model_loader: - kv 5: llama.feed_forward_length u32 = 14336
llama_model_loader: - kv 6: llama.rope.dimension_count u32 = 128
llama_model_loader: - kv 7: llama.attention.head_count u32 = 32
llama_model_loader: - kv 8: llama.attention.head_count_kv u32 = 8
llama_model_loader: - kv 9: llama.attention.layer_norm_rms_epsilon f32 = 0.000010
llama_model_loader: - kv 10: llama.rope.freq_base f32 = 1000000.000000
llama_model_loader: - kv 11: general.file_type u32 = 2
llama_model_loader: - kv 12: tokenizer.ggml.model str = llama
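The "total blobs: 18" line at startup counts the model layers already downloaded; to see the pulled models by name, you can run:

# List locally available models, their tags, and sizes
ollama list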
Run a model
ollama run gemma:2b
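ollama run pulls a model automatically on first use, but you can also download ahead of time. For instance, to fetch the llava and qwen models mentioned later (the exact tags are assumptions; check the Ollama library for current names):

# Pre-download models so the first chat does not block on a pull
ollama pull gemma:2b
ollama pull llava   # multimodal model used for the image test below
ollama pull qwen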
API call
curl http://localhost:11434/api/chat -d '{
  "model": "gemma:2b",
  "messages": [
    { "role": "user", "content": "Why is the sky blue?" }
  ]
}'
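By default /api/chat streams the reply as a sequence of JSON objects. To get the whole answer in a single JSON response instead, add "stream": false to the same request:

curl http://localhost:11434/api/chat -d '{
  "model": "gemma:2b",
  "messages": [
    { "role": "user", "content": "Why is the sky blue?" }
  ],
  "stream": false
}'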
Configure Open WebUI to call Ollama
docker run -d -p 5000:8080 \
  -e OLLAMA_API_BASE_URL=http://10.2.168.77:11434/api \
  -v open-webui:/app/backend/data \
  --name open-webui --restart always \
  ghcr.io/open-webui/open-webui:main
Set OLLAMA_API_BASE_URL to the address of your own Ollama server.
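Before opening the web UI, it is worth checking that the Ollama endpoint is reachable from the Docker host (substitute your own IP); /api/tags should return a JSON list of the pulled models:

# Quick connectivity probe against the Ollama API
curl http://10.2.168.77:11434/api/tags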
- Open the Open WebUI interface at http://10.2.168.77:5000/
On first visit you need to create an account and password, then log in.
- Configure the Ollama models.
- If you have stable-diffusion-webui, it can be hooked up as well.
- My Ollama instance is already running a llava 7B model, plus gemma and qwen.
- Let's test the llava model's image capabilities.
The results are really quite good!
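The image test above was done through the web UI; the same capability can be exercised from the command line via the chat API's images field. A sketch (photo.jpg is a placeholder path; base64 -w0 assumes GNU coreutils):

# Send a base64-encoded image to the multimodal llava model
curl http://localhost:11434/api/chat -d '{
  "model": "llava",
  "messages": [
    { "role": "user", "content": "What is in this picture?", "images": ["'"$(base64 -w0 photo.jpg)"'"] }
  ],
  "stream": false
}'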