# Listing metadata from source page (not part of the script): 25 lines, 658 B, Python.
"""Initialize an lmdeploy TurboMind pipeline for InternVL2-26B-AWQ.

Loads the 4-bit AWQ-quantized InternVL2-26B vision-language model with
tensor parallelism across two GPUs. The commented-out values record
alternative configurations tried during tuning and are kept on purpose.
"""
import os

from lmdeploy import pipeline, TurbomindEngineConfig, GenerationConfig

# Placeholder token used to mark image positions in multimodal prompts.
IMAGE_TOKEN = "[IMAGE_TOKEN]"

# Restrict this process to two GPUs (must match tp= below).
# NOTE(review): CUDA_VISIBLE_DEVICES is read when CUDA initializes; setting it
# after `from lmdeploy import ...` usually still works because CUDA context
# creation is lazy, but setting it before the import is safer — confirm.
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"
# os.environ["CUDA_VISIBLE_DEVICES"] = "2,3"

# Model initialization
model = "OpenGVLab/InternVL2-26B-AWQ"

pipe = pipeline(
    model,
    backend_config=TurbomindEngineConfig(
        model_format="awq",  # weights are 4-bit AWQ quantized
        tp=2,  # tensor parallelism across the 2 visible GPUs
        # tp=4,
        session_len=16384,  # 4096, 8192, 16384, 32768
        max_batch_size=1,
        cache_max_entry_count=0.2,  # 0.05
        cache_block_seq_len=16384,  # 8192, 16384, 32768
        # quant_policy=8,
        # precision="fp16",
    ),
    # log_level='DEBUG'
)