updated readme

Zixiao Wang 2025-05-12 11:22:46 +08:00
parent d956ce6678
commit 7fb9de49b7
13 changed files with 88 additions and 137 deletions


@@ -1,2 +1,52 @@
# api
# API Usage Guide
This project supports three methods for starting and interacting with the API.
## Method 1: Standard API Start
Start the API using the main script:
```
python main.py
```
## Method 2: Gradio UI
To use the Gradio UI:
1. In the `endpoints/` directory, comment out the `# api` versions of the endpoint functions and uncomment the `# gradio` versions, as sketched below.
2. Run the UI script:
```
python ui.py
```
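For instance, `endpoints/image.py` (shown later in this commit) keeps both variants side by side; enabling Gradio means swapping which one is commented out. A sketch of the toggle, with signatures taken from the diff below and bodies elided:
```
import numpy as np

# api (comment this variant out when using the Gradio UI)
# async def image_query(file: UploadFile, question: str = Form(...)):
#     ...

# gradio (uncomment this variant to serve the Gradio UI)
async def image_query(image: np.ndarray, question: str):
    ...
```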
## Method 3: Celery Optimization
Use Celery to run preprocessing and inference as background tasks on separate queues. Adjust the `--concurrency` values in the commands below as needed.
### Step 1: Start Redis Server
```
redis-server
```
### Step 2: Start Celery Workers
Open two separate terminals and run the following:
**Terminal 1 (Preprocessing Queue):**
```
celery -A tasks worker --pool=threads --loglevel=info --concurrency=2 --queues=preprocess_queue
```
**Terminal 2 (Inference Queue):**
```
celery -A tasks worker --pool=threads --loglevel=info --concurrency=3 --queues=inference_queue
```
### Step 3: Run Debug Script
```
python celery_debug.py
```
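For reference, the debug script submits tasks to the queues configured in `tasks.py`. A minimal client sketch, assuming the Redis broker from Step 1 on its default port (the exact broker/backend URLs live in `tasks.py`, and the question string here is only a placeholder):
```
# Minimal client sketch; broker/backend URLs are assumptions based on Step 1.
from celery import Celery

app = Celery("tasks", broker="redis://localhost:6379/0", backend="redis://localhost:6379/0")

result = app.send_task(
    "tasks.preprocess_video",                       # routed to preprocess_queue in tasks.py
    args=["../video/3.mp4", "Describe the video."], # placeholder question
    queue="preprocess_queue",
)
print(result.get(timeout=600))                      # blocks until a worker finishes
```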


@@ -41,7 +41,7 @@ print("Broker:", app.conf.broker_url)
# Define the number of concurrent tasks
NUM_TASKS = 1
delay_seconds = 0
file_paths = [f"../video/film4.mp4" for _ in range(NUM_TASKS)]
file_paths = [f"../video/3.mp4" for _ in range(NUM_TASKS)]
# video_folder = "../video"
@@ -71,29 +71,29 @@ file_paths = [f"../video/film4.mp4" for _ in range(NUM_TASKS)]
#     for i in range(NUM_TASKS)
# ]
# questions = [
#     f"Based on the given images and audio script, extract detailed information about the products recommended in the video and format the output as JSON with the following fields:\n"
#     f"- Product Name: <name>\n"
#     f"- Category: <category>\n"
#     f"- Styles or Variants: <styles/variants>\n"
#     f"- Highlights: <highlights>\n"
#     f"- Promotional Details: <promotional_details>\n"
#     f"Do not include any disclaimers or comments like 'I'm sorry' or 'I can't assist.' Task {i}"
#     for i in range(NUM_TASKS)
# ]
questions = [
    f"Generate a screenplay based on the given video content and format the output as JSON with the following structured fields:\n"
    f"- Scene Descriptions: <visual setting including background, lighting, atmosphere>\n"
    f"- Character Introductions: <key characters with appearance and notable traits>\n"
    f"- Dialogue: <transcribed spoken lines in screenplay format>\n"
    f"- Actions & Expressions: <non-verbal cues and interactions>\n"
    f"- Product Integrations: <product names, categories, features if applicable>\n"
    f"- Narrative Flow: <scene transitions and pacing notes>\n"
    f"Follow standard screenplay formatting for headers, character names, dialogue, and actions. Do not include disclaimers or comments like 'I can't assist.' Task {i}"
    f"Based on the given images and audio script, extract detailed information about the products recommended in the video and format the output as JSON with the following fields:\n"
    f"- Product Name: <name>\n"
    f"- Category: <category>\n"
    f"- Styles or Variants: <styles/variants>\n"
    f"- Highlights: <highlights>\n"
    f"- Promotional Details: <promotional_details>\n"
    f"Do not include any disclaimers or comments like 'I'm sorry' or 'I can't assist.' Task {i}"
    for i in range(NUM_TASKS)
]
# questions = [
#     f"Generate a screenplay based on the given video content and format the output as JSON with the following structured fields:\n"
#     f"- Scene Descriptions: <visual setting including background, lighting, atmosphere>\n"
#     f"- Character Introductions: <key characters with appearance and notable traits>\n"
#     f"- Dialogue: <transcribed spoken lines in screenplay format>\n"
#     f"- Actions & Expressions: <non-verbal cues and interactions>\n"
#     f"- Product Integrations: <product names, categories, features if applicable>\n"
#     f"- Narrative Flow: <scene transitions and pacing notes>\n"
#     f"Follow standard screenplay formatting for headers, character names, dialogue, and actions. Do not include disclaimers or comments like 'I can't assist.' Task {i}"
#     for i in range(NUM_TASKS)
# ]
# def submit_task(question, index): # sends tasks to Celery asynchronously, queues the tasks in Celery broker. If multiple Celery workers, they process tasks in parallel.
# """ Submits a Celery task with increasing delay """
# countdown_time = index * delay_seconds # Dynamic delay
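The commented-out `submit_task` above queues tasks asynchronously with an increasing delay; a hedged reconstruction of that pattern with Celery's `countdown` option follows. It assumes `video_query_task` from `tasks.py`, reuses the script's `file_paths`, `questions`, and `delay_seconds`, and assumes a worker consumes whatever queue that task lands on:
```
# Hedged reconstruction of the staggered-submission pattern described above.
from tasks import video_query_task

def submit_task(question, index):
    """Submit one Celery task, delaying each successive submission."""
    countdown_time = index * delay_seconds  # dynamic delay per task
    return video_query_task.apply_async(
        args=[file_paths[index], question],
        countdown=countdown_time,
    )

results = [submit_task(q, i) for i, q in enumerate(questions)]
```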


@@ -8,6 +8,7 @@ from PIL import Image
from pipeline_setup import pipe, IMAGE_TOKEN
from utils.image_processing import encode_image_base64
# api
async def image_query(file: UploadFile, question: str = Form(...)):
    """
    API endpoint to process an image with the user's query.
@@ -26,11 +27,8 @@ async def image_query(file: UploadFile, question: str = Form(...)):
    except Exception as e:
        return JSONResponse({"query": question, "error": str(e)})
# gradio
# async def image_query(image: np.ndarray, question: str):
#     """
#     API endpoint to process an image (as numpy array) with the user's query.
#     """
#     try:
#         # Convert the numpy array to a PIL Image
#         image = Image.fromarray(image).convert("RGB").resize((512, 512))
@@ -49,7 +47,7 @@ async def image_query(file: UploadFile, question: str = Form(...)):
#     except Exception as e:
#         return {"query": question, "error": str(e)}
# celery
# def image_query(image_path: str, question: str):
# try:
# print("image_path in image_query...")


@@ -24,6 +24,7 @@ def load_checkpoint(video_id):
        return json.load(f)
    return None
# api
# async def video_query(file: UploadFile, question: str = Form(...)):
#     try:
#         print("Processing video...")
@@ -131,7 +132,7 @@ def load_checkpoint(video_id):
#     except Exception as e:
#         return JSONResponse({"query": question, "error": str(e)})
# gradio
# async def video_query(video_path: str, question: str):
#     """
#     API endpoint to process a video file with the user's query.
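The `load_checkpoint` helper near the top of this file returns the saved JSON checkpoint for a video if one exists, else `None`; a hedged sketch of the matching writer, where the directory name and layout are assumptions:
```
import json
import os

CHECKPOINT_DIR = "checkpoints"  # assumed location of per-video checkpoints

def save_checkpoint(video_id, data):
    # Persist intermediate results so a long-running video job can resume.
    os.makedirs(CHECKPOINT_DIR, exist_ok=True)
    with open(os.path.join(CHECKPOINT_DIR, f"{video_id}.json"), "w") as f:
        json.dump(data, f)
```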

prompt.txt Normal file

@@ -0,0 +1,11 @@
Image
"Extract the following information from this image and return the result in JSON format:\n" "- Name: <name>\n" "- ID: <id>\n" "- Profile Picture: <url>\n" "- Follower Count: <count>\n" "- Likes Count: <count>\n" "- Bio: <bio>\n" "- Following Count: <count>\n" "- External Links: <links>\n" # "Provide no additional text other than the extracted information." "Do not include any disclaimers or comments like 'I'm sorry' or 'I can't assist'."
Video
"Based on the given images and audio script, extract detailed information about the products recommended in the video and format the output as JSON with the following fields:
1. **Product Name**: The specific name of the product, if mentioned.
2. **Category**: The specific category of the product (e.g., electronics, skincare, casual wear, etc.).
3. **Styles or Variants**: Any styles, designs, or variants of the product described (e.g., colors, patterns, sizes, or other distinguishing attributes).
4. **Highlights**: The unique selling points or notable features emphasized by the anchor (e.g., benefits, quality, or standout features).
5. **Promotional Details**: Any additional promotional information mentioned, such as discounts, offers, or key features that set the product apart."


@@ -6,8 +6,6 @@ mp.set_start_method("spawn", force=True)
# mp.set_start_method("fork", force=True)
from celery import Celery
import psutil
from pynvml import nvmlInit, nvmlDeviceGetCount, nvmlDeviceGetHandleByIndex, nvmlDeviceGetUtilizationRates
# from endpoints.video import run_video_inference
@@ -54,11 +52,6 @@ app.conf.worker_prefetch_multiplier = 1
#     include=["tasks"]
# )
# app.conf.task_routes = {
#     'tasks.preprocess_video': {'queue': 'preprocess_queue'},
#     'tasks.inference_video': {'queue': 'inference_queue'},
# }
app.conf.task_routes = {
    'tasks.preprocess_video': {'queue': 'preprocess_queue'},
    'tasks.inference_video': {'queue': 'inference_queue'},
@@ -93,11 +86,6 @@ app.conf.task_routes = {
# celery.conf.task_time_limit = 60 # 60 seconds max execution time
# celery.conf.task_soft_time_limit = 50 # Warn at 50 seconds
@app.task
def add(x, y):
    print("Adding task...")
    return x + y
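The `add` task above works as a smoke test for the Celery wiring; a minimal usage sketch, which assumes a result backend is configured and that some worker is also listening on the default `celery` queue (the `task_routes` above only route the video tasks):
```
# Smoke test; assumes a worker consuming the default "celery" queue.
from tasks import add

async_result = add.delay(2, 3)           # enqueue the task
print(async_result.get(timeout=10))      # -> 5 once a worker picks it up
```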
@app.task(name="tasks.text_query_task")
def text_query_task(question: str):
    print("Importing text_query...")
@@ -140,11 +128,6 @@ def video_query_task(file_path: str, question: str):
#     return run_video_inference(preprocessed_data)
# @celery.task(name="tasks.test_task")
# def test_task():
#     return "Celery is working!"
import mimetypes
from utils.video_processing import split_video_into_segments, extract_motion_key_frames, extract_audio_from_video
@@ -342,7 +325,7 @@ def preprocess_video(video_path, question):
# for preprocessed_data in preprocessed_results:
#     video_path = preprocessed_data["video_path"]
#     question = preprocessed_data["question"]
#     segments = preprocessed_data["processed_segments"]
#     segments = preprocessed_data["processed_data"]
#     print(f"Inferencing video: {video_path}")


@@ -1,92 +0,0 @@
import torch
import os
from whisper import load_model
from pydub import AudioSegment

def extract_audio_from_video(video_path: str) -> str:
    audio = AudioSegment.from_file(video_path)
    audio_path = "/tmp/temp_audio_test.wav"
    audio.export(audio_path, format="wav")
    print("audio extracted!")
    return audio_path
# def transcribe_audio(audio_path: str) -> str:
#     print("Loading model in transcribe_audio...")
#     from transformers import WhisperProcessor, WhisperForConditionalGeneration
#     import torch
#     # Load processor and model from transformers
#     processor = WhisperProcessor.from_pretrained("openai/whisper-base")
#     model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-base")
#     if torch.cuda.device_count() > 1:
#         print(f"Using {torch.cuda.device_count()} GPUs!")
#         model = torch.nn.DataParallel(model)
#     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#     model.to(device)
#     print("Model loaded successfully.")
#     print(audio_path)
#     # Load and process the audio file
#     import librosa
#     audio_input, sr = librosa.load(audio_path, sr=16000)
#     input_features = processor(audio_input, sampling_rate=sr, return_tensors="pt").input_features.to(device)
#     # Generate transcription
#     with torch.no_grad():
#         if isinstance(model, torch.nn.DataParallel):
#             generated_ids = model.module.generate(input_features)
#         else:
#             generated_ids = model.generate(input_features)
#     # Decode the generated tokens to text
#     transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
#     return transcription
def transcribe_audio(audio_path: str) -> str:
    print("Loading model in transcribe_audio...")
    os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"
    from transformers import WhisperProcessor, WhisperForConditionalGeneration
    import torch
    # Load processor and model from transformers
    processor = WhisperProcessor.from_pretrained("openai/whisper-tiny")
    model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny")
    if torch.cuda.device_count() > 1:
        print(f"Using {torch.cuda.device_count()} GPUs!")
        model = torch.nn.DataParallel(model)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    print("Model loaded successfully.")
    print(audio_path)
    # Load and process the audio file
    import librosa
    audio_input, sr = librosa.load(audio_path, sr=16000)
    input_features = processor(audio_input, sampling_rate=sr, return_tensors="pt").input_features.to(device)
    # Generate transcription
    with torch.no_grad():
        if isinstance(model, torch.nn.DataParallel):
            generated_ids = model.module.generate(input_features)
        else:
            generated_ids = model.generate(input_features)
    # Decode the generated tokens to text
    transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
    return transcription
if __name__ == "__main__":
    extract_audio_from_video("../video/1.mp4")
    audio_file = "/tmp/temp_audio_test.wav"
    for i in range(3):
        print(f"\nTranscription attempt {i + 1}:")
        transcription = transcribe_audio(audio_file)
        print("Transcription:")
        print(transcription)