import os

from lmdeploy import pipeline, TurbomindEngineConfig, GenerationConfig

# Presumably the placeholder marking image positions in prompts; it is not
# referenced in the visible part of this file -- TODO confirm usage.
IMAGE_TOKEN = "[IMAGE_TOKEN]"

# Restrict this process to GPUs 0 and 1 (alternative pair: "2,3").
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"

# Model initialization.
# Alternative checkpoints:
#   "OpenGVLab/InternVL2-26B-AWQ"
#   "OpenGVLab/InternVL2_5-4B-AWQ"
model = "OpenGVLab/InternVL2_5-8B-MPO-AWQ"

# Engine settings are assembled first so the pipeline call below stays short.
_engine_config = TurbomindEngineConfig(
    model_format="awq",
    # Alternative: tp=4. Presumably this should not exceed the number of GPUs
    # listed in CUDA_VISIBLE_DEVICES above -- TODO confirm.
    tp=2,
    session_len=16384,  # candidate values: 4096, 8192, 16384, 32768
    max_batch_size=1,
    cache_max_entry_count=0.15,  # alternative: 0.05
    # Candidate values: 8192, 16384, 32768.
    # NOTE(review): this looks large for a k/v cache block length; verify
    # against the LMDeploy documentation that 16384 is intended.
    cache_block_seq_len=16384,
    # Currently disabled options: quant_policy=8, precision="fp16".
)

# Pass log_level='DEBUG' to pipeline() for verbose logging (currently off).
pipe = pipeline(model, backend_config=_engine_config)