updated image and video upload

This commit is contained in:
Zixiao Wang 2025-01-24 14:11:46 +08:00
parent 37276d48f9
commit 5c00be4edd
8 changed files with 173 additions and 175 deletions

endpoints/image.py

@@ -1,54 +1,51 @@
 from fastapi import UploadFile, Form
 from fastapi.responses import JSONResponse
 import io
+import base64
 import asyncio
+import numpy as np
 from PIL import Image
 from pipeline_setup import pipe, IMAGE_TOKEN
 from utils.image_processing import encode_image_base64
-async def image_query(file: UploadFile, question: str = Form(...)):
-    """
-    API endpoint to process an image with the user's query.
-    """
-    try:
-        if file.content_type not in ["image/jpeg", "image/png"]:
-            return JSONResponse({"query": question, "error": "Unsupported file type."})
-        image_data = await file.read()
-        image = Image.open(io.BytesIO(image_data)).convert("RGB").resize((512, 512))
-        encoded_image_base64 = encode_image_base64(image)
-        question_with_image_token = f"{question}\n{IMAGE_TOKEN}"
-        response = await asyncio.to_thread(pipe, (question, image))
-        return JSONResponse({"query": question, "response": response.text})
-    except Exception as e:
-        return JSONResponse({"query": question, "error": str(e)})
-# import mimetypes
 # async def image_query(file: UploadFile, question: str = Form(...)):
 #     """
 #     API endpoint to process an image with the user's query.
 #     """
 #     try:
-#         # Get the file path from the UploadFile object
-#         file_path = file.filename
-#         # Determine the file type using the file extension
-#         file_type, _ = mimetypes.guess_type(file_path)
-#         if file_type not in ["image/jpeg", "image/png"]:
-#             return {"query": question, "error": "Unsupported file type."}
-#         # Read the image file
+#         if file.content_type not in ["image/jpeg", "image/png"]:
+#             return JSONResponse({"query": question, "error": "Unsupported file type."})
 #         image_data = await file.read()
 #         image = Image.open(io.BytesIO(image_data)).convert("RGB").resize((512, 512))
 #         encoded_image_base64 = encode_image_base64(image)
-#         # Prepare the query with the image token
 #         question_with_image_token = f"{question}\n{IMAGE_TOKEN}"
-#         # Query the model
 #         response = await asyncio.to_thread(pipe, (question, image))
-#         return {"query": question, "response": response.text}
+#         return JSONResponse({"query": question, "response": response.text})
 #     except Exception as e:
-#         return {"query": question, "error": str(e)}
+#         return JSONResponse({"query": question, "error": str(e)})
+# import mimetypes
+async def image_query(image: np.ndarray, question: str):
+    """
+    API endpoint to process an image (as numpy array) with the user's query.
+    """
+    try:
+        # Convert the numpy array to a PIL Image
+        image = Image.fromarray(image).convert("RGB").resize((512, 512))
+        # Encode the image to base64 (optional, if needed by your pipeline)
+        buffered = io.BytesIO()
+        image.save(buffered, format="JPEG")
+        encoded_image_base64 = base64.b64encode(buffered.getvalue()).decode("utf-8")
+        # Prepare the query with the image token
+        question_with_image_token = f"{question}\n{IMAGE_TOKEN}"
+        # Query the model
+        response = await asyncio.to_thread(pipe, (question, image))
+        return {"query": question, "response": response.text}
+    except Exception as e:
+        return {"query": question, "error": str(e)}

endpoints/text.py

@@ -3,22 +3,22 @@ from fastapi.responses import JSONResponse
 from asyncio import to_thread
 from pipeline_setup import pipe
-async def text_query(question: str = Form(...)):
-    """
-    API endpoint to process text input with the user's query.
-    """
-    try:
-        response = await to_thread(pipe, question)
-        return JSONResponse({"query": question, "response": response.text})
-    except Exception as e:
-        return JSONResponse({"query": question, "error": str(e)})
 # async def text_query(question: str = Form(...)):
 #     """
 #     API endpoint to process text input with the user's query.
 #     """
 #     try:
 #         response = await to_thread(pipe, question)
-#         return {"query": question, "response": response.text}
+#         return JSONResponse({"query": question, "response": response.text})
 #     except Exception as e:
-#         return {"query": question, "error": str(e)}
+#         return JSONResponse({"query": question, "error": str(e)})
+async def text_query(question: str = Form(...)):
+    """
+    API endpoint to process text input with the user's query.
+    """
+    try:
+        response = await to_thread(pipe, question)
+        return {"query": question, "response": response.text}
+    except Exception as e:
+        return {"query": question, "error": str(e)}

endpoints/video.py

@@ -5,118 +5,9 @@ from utils.image_processing import encode_image_base64
 from utils.video_processing import split_video_into_segments, extract_motion_key_frames, extract_audio_from_video
 from utils.audio_transcription import transcribe_audio
 import asyncio
-import time
+import mimetypes
 from concurrent.futures import ThreadPoolExecutor
-async def video_query(file: UploadFile, question: str = Form(...)):
-    """
-    API endpoint to process a video file with the user's query.
-    """
-    try:
-        print("Processing video...")
-        # Validate file type
-        if file.content_type not in ["video/mp4", "video/avi", "video/mkv"]:
-            return JSONResponse({"query": question, "error": "Unsupported video file type."})
-        # Start overall timer
-        overall_start_time = time.time()
-        # Save the uploaded video to a temporary file
-        print("Reading video...")
-        video_data = await file.read()
-        temp_video_path = "/tmp/temp_video.mp4"
-        with open(temp_video_path, "wb") as temp_video_file:
-            temp_video_file.write(video_data)
-        print(f"Temp video saved to: {temp_video_path}")
-        # Record the time after reading the video
-        video_reading_time = time.time()
-        # Split the video into segments
-        print("Splitting video...")
-        segments = split_video_into_segments(temp_video_path, segment_duration=30)
-        print(f"Video split into {len(segments)} segments.")
-        aggregated_responses = []
-        segment_timings = []
-        for i, segment_path in enumerate(segments):
-            print(f"Processing segment {i+1}/{len(segments)}: {segment_path}")
-            # Start timing for the segment
-            segment_start_time = time.time()
-            # Extract key frames
-            frame_start_time = time.time()
-            imgs = extract_motion_key_frames(segment_path, max_frames=50, sigma_multiplier=2)
-            frame_time = time.time()
-            # Extract audio and transcribe
-            audio_start_time = time.time()
-            audio_path = extract_audio_from_video(segment_path)
-            transcribed_text = transcribe_audio(audio_path)
-            audio_time = time.time()
-            # Combine transcribed text with the query
-            combined_query = f"Audio Transcript: {transcribed_text}\n{question}"
-            # Prepare content for the pipeline
-            question_with_frames = ""
-            for j, img in enumerate(imgs):
-                question_with_frames += f"Frame{j+1}: {{IMAGE_TOKEN}}\n"
-            question_with_frames += combined_query
-            content = [{"type": "text", "text": question_with_frames}]
-            for img in imgs:
-                content.append({
-                    "type": "image_url",
-                    "image_url": {
-                        "max_dynamic_patch": 1,
-                        "url": f"data:image/jpeg;base64,{encode_image_base64(img)}"
-                    }
-                })
-            # Query the model
-            inference_start_time = time.time()
-            messages = [dict(role="user", content=content)]
-            response = await asyncio.to_thread(pipe, messages)
-            inference_time = time.time()
-            # Aggregate response
-            aggregated_responses.append(response.text)
-            # Calculate timing for the segment
-            segment_timings.append({
-                "segment_index": i + 1,
-                "segment_processing_time": inference_time - segment_start_time,
-                "frame_extraction_time": frame_time - frame_start_time,
-                "audio_extraction_time": audio_time - audio_start_time,
-                "model_inference_time": inference_time - inference_start_time
-            })
-            print(f"transcription: {transcribed_text}")
-            # print(f"content: {content}")
-        overall_end_time = time.time()
-        # Aggregate total timings
-        total_timings = {
-            "video_reading_time": video_reading_time - overall_start_time,
-            "total_segments": len(segments),
-            "total_processing_time": overall_end_time - overall_start_time,
-            "segment_details": segment_timings
-        }
-        return JSONResponse({
-            "question": question,
-            "responses": aggregated_responses,
-            "timings": total_timings,
-        })
-    except Exception as e:
-        return JSONResponse({"query": question, "error": str(e)})
 # async def video_query(file: UploadFile, question: str = Form(...)):
 #     """
 #     API endpoint to process a video file with the user's query.
@@ -124,13 +15,9 @@ async def video_query(file: UploadFile, question: str = Form(...)):
 #     try:
 #         print("Processing video...")
-#         # Get the file path from the UploadFile object
-#         file_path = file.filename
-#         # Determine the file type using the file extension
-#         file_type, _ = mimetypes.guess_type(file_path)
-#         if file_type is None or not file_type.startswith("video/"):
-#             return {"query": question, "error": "Unsupported video file type."}
+#         # Validate file type
+#         if file.content_type not in ["video/mp4", "video/avi", "video/mkv"]:
+#             return JSONResponse({"query": question, "error": "Unsupported video file type."})
 #         # Start overall timer
 #         overall_start_time = time.time()
@@ -209,6 +96,7 @@ async def video_query(file: UploadFile, question: str = Form(...)):
 #             })
 #             print(f"transcription: {transcribed_text}")
+#             # print(f"content: {content}")
 #         overall_end_time = time.time()
@@ -220,10 +108,80 @@ async def video_query(file: UploadFile, question: str = Form(...)):
 #             "segment_details": segment_timings
 #         }
-#         return {
+#         return JSONResponse({
 #             "question": question,
 #             "responses": aggregated_responses,
 #             "timings": total_timings,
-#         }
+#         })
 #     except Exception as e:
-#         return {"query": question, "error": str(e)}
+#         return JSONResponse({"query": question, "error": str(e)})
+async def video_query(video_path: str, question: str):
+    """
+    API endpoint to process a video file with the user's query.
+    """
+    try:
+        print("Processing video...")
+        if not video_path or not isinstance(video_path, str):
+            return {"query": question, "error": "No video file provided or invalid file input."}
+        # Determine the file type using the file extension
+        file_type, _ = mimetypes.guess_type(video_path)
+        if file_type is None or not file_type.startswith("video/"):
+            return {"query": question, "error": "Unsupported video file type."}
+        # Log the video path
+        print(f"Video path: {video_path}")
+        # Split the video into segments
+        print("Splitting video...")
+        segments = split_video_into_segments(video_path, segment_duration=30)
+        print(f"Video split into {len(segments)} segments.")
+        aggregated_responses = []
+        segment_timings = []
+        for i, segment_path in enumerate(segments):
+            print(f"Processing segment {i+1}/{len(segments)}: {segment_path}")
+            # Extract key frames
+            imgs = extract_motion_key_frames(segment_path, max_frames=50, sigma_multiplier=2)
+            # Extract audio and transcribe
+            audio_path = extract_audio_from_video(segment_path)
+            transcribed_text = transcribe_audio(audio_path)
+            # Combine transcribed text with the query
+            combined_query = f"Audio Transcript: {transcribed_text}\n{question}"
+            # Prepare content for the pipeline
+            question_with_frames = ""
+            for j, img in enumerate(imgs):
+                question_with_frames += f"Frame{j+1}: {{IMAGE_TOKEN}}\n"
+            question_with_frames += combined_query
+            content = [{"type": "text", "text": question_with_frames}]
+            for img in imgs:
+                content.append({
+                    "type": "image_url",
+                    "image_url": {
+                        "max_dynamic_patch": 1,
+                        "url": f"data:image/jpeg;base64,{encode_image_base64(img)}"
+                    }
+                })
+            # Query the model
+            messages = [dict(role="user", content=content)]
+            response = await asyncio.to_thread(pipe, messages)
+            # Aggregate response
+            aggregated_responses.append(response.text)
+        return {
+            "question": question,
+            "responses": aggregated_responses,
+        }
+    except Exception as e:
+        return {"query": question, "error": str(e)}

flagged/log.csv (new file)

@@ -0,0 +1,2 @@
+prompts,Response,flag,username,timestamp
+"{""image"": ""flagged/prompts/fdd45d065153a29e7e3d/1.2.png"", ""points"": []}",,,,2025-01-24 11:09:07.710989

flagged/prompts/fdd45d065153a29e7e3d/1.2.png (binary image, 44 KiB; not shown)

pipeline_setup.py

@@ -9,8 +9,9 @@ pipe = pipeline(
     model,
     backend_config=TurbomindEngineConfig(
         model_format="awq",
-        tp=2,
-        device_ids=[0, 1],
+        # tp=2,
+        tp=4,
+        # device_ids=[0, 1],
         session_len=12864,
         max_batch_size=1,
         cache_max_entry_count=0.05,
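The hunk only shows the engine arguments: tensor parallelism goes from 2 GPUs to 4, and the `device_ids` argument is commented out. `TurbomindEngineConfig` suggests an lmdeploy pipeline; for context, a sketch of how the full call in `pipeline_setup.py` plausibly reads after this change — the model path and the `IMAGE_TOKEN` import are assumptions, not shown in the diff:

```python
# Sketch of pipeline_setup.py after the change (assumed lmdeploy API;
# the model path below is a placeholder, not the repo's actual model).
from lmdeploy import pipeline, TurbomindEngineConfig
from lmdeploy.vl.constants import IMAGE_TOKEN  # assumed source of IMAGE_TOKEN

model = "OpenGVLab/InternVL2-8B-AWQ"  # placeholder AWQ-quantized model

pipe = pipeline(
    model,
    backend_config=TurbomindEngineConfig(
        model_format="awq",           # AWQ-quantized weights
        tp=4,                         # shard the model across 4 GPUs (was tp=2)
        session_len=12864,            # max tokens per session
        max_batch_size=1,
        cache_max_entry_count=0.05,   # keep the KV cache footprint small
    ),
)
```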

ui.py

@@ -1,9 +1,17 @@
-import gradio as gr
+import os
 import asyncio
+import gradio as gr
+from gradio_image_prompter import ImagePrompter
 from endpoints.text import text_query
 from endpoints.image import image_query
 from endpoints.video import video_query
+os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"
+import torch
+print("Available GPUs:", torch.cuda.device_count())
+print("Visible Devices:", [torch.cuda.get_device_name(i) for i in range(torch.cuda.device_count())])
 def setup_ui():
     with gr.Blocks() as ui:
         gr.Markdown(
@@ -32,13 +40,39 @@ def setup_ui():
         with gr.Tab("Image Query"):
             gr.Markdown("### Submit an Image Query")
             with gr.Row():
-                image_input = gr.File(label="Upload Image")
+                image_prompter = ImagePrompter(show_label=False)
                 image_question_input = gr.Textbox(label="Your Question", placeholder="Type your question here...")
             image_button = gr.Button("Submit")
             image_output = gr.Textbox(label="Response", interactive=False)
+            # async def handle_image_query(prompts, question):
+            #     response = await image_query(prompts["image"], question)
+            #     return response["response"] if "response" in response else response["error"]
+            async def handle_image_query(prompts, question):
+                """
+                Handles the image query and ensures that inputs are valid.
+                """
+                try:
+                    # Validate prompts
+                    if prompts is None or "image" not in prompts:
+                        return "No image provided. Please upload an image."
+                    image_data = prompts["image"]
+                    # Check if image_data is valid
+                    if image_data is None:
+                        return "Invalid image input. Please upload a valid image."
+                    # Call the `image_query` function
+                    response = await image_query(image_data, question)
+                    return response["response"] if "response" in response else response["error"]
+                except Exception as e:
+                    return str(e)
             image_button.click(
-                fn=lambda img, q: asyncio.run(image_query(img, q)),
-                inputs=[image_input, image_question_input],
+                fn=handle_image_query,
+                inputs=[image_prompter, image_question_input],
                 outputs=[image_output]
             )
@@ -46,18 +80,24 @@ def setup_ui():
         with gr.Tab("Video Query"):
             gr.Markdown("### Submit a Video Query")
             with gr.Row():
-                video_input = gr.File(label="Upload Video")
+                video_input = gr.Video(label="Upload Video")
                 video_question_input = gr.Textbox(label="Your Question", placeholder="Type your question here...")
             video_button = gr.Button("Submit")
             video_output = gr.Textbox(label="Response", interactive=False)
+            async def handle_video_query(video, question):
+                response = await video_query(video, question)
+                return response.get("responses", response.get("error", "Error processing video."))
             video_button.click(
-                fn=lambda vid, q: asyncio.run(video_query(vid, q)),
+                fn=handle_video_query,
                 inputs=[video_input, video_question_input],
                 outputs=[video_output]
             )
     return ui
 if __name__ == "__main__":
     ui = setup_ui()
-    ui.launch(server_name="0.0.0.0", server_port=7860)
+    ui.launch(server_name="0.0.0.0", server_port=8002)
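For reference, the payload `ImagePrompter` feeds `handle_image_query` matches the row captured in `flagged/log.csv`: a dict with an `image` entry plus any drawn `points`. A minimal stand-in, with a synthetic array where the component would put the uploaded image:

```python
# Sketch: the prompts dict shape consumed by handle_image_query
# (keys per gradio_image_prompter, as logged in flagged/log.csv).
import asyncio
import numpy as np
from endpoints.image import image_query

prompts = {
    "image": np.zeros((512, 512, 3), dtype=np.uint8),  # stand-in for the upload
    "points": [],                                      # no regions drawn
}
if prompts.get("image") is not None:
    result = asyncio.run(image_query(prompts["image"], "Describe the image."))
    print(result.get("response") or result.get("error"))
```

Note that the handlers are passed to `image_button.click` / `video_button.click` as coroutine functions, which Gradio runs on its own event loop; this replaces the previous `asyncio.run(...)` lambdas.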