updated readme

Zixiao Wang 2025-05-12 11:22:46 +08:00
parent d956ce6678
commit 7fb9de49b7
13 changed files with 88 additions and 137 deletions


@@ -1,2 +1,52 @@
# api
# API Usage Guide
This project supports three methods for starting and interacting with the API.
## Method 1: Standard API Start
Start the API using the main script:
```
python main.py
```
## Method 2: Gradio UI
To use the Gradio UI:
1. In the `endpoints/` directory, comment out the `# api` versions of the endpoint functions and uncomment the `# gradio` versions, as sketched below.
2. Run the UI script:
```
python ui.py
```
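For instance, `endpoints/image.py` (shown later in this commit) keeps both variants side by side; enabling Gradio means swapping which one is commented out. A sketch of the toggle, with signatures taken from the diff below and bodies elided:
```
import numpy as np

# api (comment this variant out when using the Gradio UI)
# async def image_query(file: UploadFile, question: str = Form(...)):
#     ...

# gradio (uncomment this variant to serve the Gradio UI)
async def image_query(image: np.ndarray, question: str):
    ...
```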
## Method 3: Celery Optimization
Use Celery to run preprocessing and inference as background tasks on separate queues. Adjust the `--concurrency` values in the commands below as needed.
### Step 1: Start Redis Server
```
redis-server
```
### Step 2: Start Celery Workers
Open two separate terminals and run the following:
**Terminal 1 (Preprocessing Queue):**
```
celery -A tasks worker --pool=threads --loglevel=info --concurrency=2 --queues=preprocess_queue
```
**Terminal 2 (Inference Queue):**
```
celery -A tasks worker --pool=threads --loglevel=info --concurrency=3 --queues=inference_queue
```
### Step 3: Run Debug Script
```
python celery_debug.py
```
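For reference, the debug script submits tasks to the queues configured in `tasks.py`. A minimal client sketch, assuming the Redis broker from Step 1 on its default port (the exact broker/backend URLs live in `tasks.py`, and the question string here is only a placeholder):
```
# Minimal client sketch; broker/backend URLs are assumptions based on Step 1.
from celery import Celery

app = Celery("tasks", broker="redis://localhost:6379/0", backend="redis://localhost:6379/0")

result = app.send_task(
    "tasks.preprocess_video",                       # routed to preprocess_queue in tasks.py
    args=["../video/3.mp4", "Describe the video."], # placeholder question
    queue="preprocess_queue",
)
print(result.get(timeout=600))                      # blocks until a worker finishes
```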


@@ -41,7 +41,7 @@ print("Broker:", app.conf.broker_url)
# Define the number of concurrent tasks
NUM_TASKS = 1
delay_seconds = 0
file_paths = [f"../video/film4.mp4" for _ in range(NUM_TASKS)]
file_paths = [f"../video/3.mp4" for _ in range(NUM_TASKS)]
# video_folder = "../video"
@@ -71,29 +71,29 @@ file_paths = [f"../video/film4.mp4" for _ in range(NUM_TASKS)]
#     for i in range(NUM_TASKS)
# ]
# questions = [
#     f"Based on the given images and audio script, extract detailed information about the products recommended in the video and format the output as JSON with the following fields:\n"
#     f"- Product Name: <name>\n"
#     f"- Category: <category>\n"
#     f"- Styles or Variants: <styles/variants>\n"
#     f"- Highlights: <highlights>\n"
#     f"- Promotional Details: <promotional_details>\n"
#     f"Do not include any disclaimers or comments like 'I'm sorry' or 'I can't assist.' Task {i}"
#     for i in range(NUM_TASKS)
# ]
questions = [
    f"Generate a screenplay based on the given video content and format the output as JSON with the following structured fields:\n"
    f"- Scene Descriptions: <visual setting including background, lighting, atmosphere>\n"
    f"- Character Introductions: <key characters with appearance and notable traits>\n"
    f"- Dialogue: <transcribed spoken lines in screenplay format>\n"
    f"- Actions & Expressions: <non-verbal cues and interactions>\n"
    f"- Product Integrations: <product names, categories, features if applicable>\n"
    f"- Narrative Flow: <scene transitions and pacing notes>\n"
    f"Follow standard screenplay formatting for headers, character names, dialogue, and actions. Do not include disclaimers or comments like 'I can't assist.' Task {i}"
    f"Based on the given images and audio script, extract detailed information about the products recommended in the video and format the output as JSON with the following fields:\n"
    f"- Product Name: <name>\n"
    f"- Category: <category>\n"
    f"- Styles or Variants: <styles/variants>\n"
    f"- Highlights: <highlights>\n"
    f"- Promotional Details: <promotional_details>\n"
    f"Do not include any disclaimers or comments like 'I'm sorry' or 'I can't assist.' Task {i}"
    for i in range(NUM_TASKS)
]
# questions = [
#     f"Generate a screenplay based on the given video content and format the output as JSON with the following structured fields:\n"
#     f"- Scene Descriptions: <visual setting including background, lighting, atmosphere>\n"
#     f"- Character Introductions: <key characters with appearance and notable traits>\n"
#     f"- Dialogue: <transcribed spoken lines in screenplay format>\n"
#     f"- Actions & Expressions: <non-verbal cues and interactions>\n"
#     f"- Product Integrations: <product names, categories, features if applicable>\n"
#     f"- Narrative Flow: <scene transitions and pacing notes>\n"
#     f"Follow standard screenplay formatting for headers, character names, dialogue, and actions. Do not include disclaimers or comments like 'I can't assist.' Task {i}"
#     for i in range(NUM_TASKS)
# ]
# def submit_task(question, index): # sends tasks to Celery asynchronously, queues the tasks in Celery broker. If multiple Celery workers, they process tasks in parallel.
# """ Submits a Celery task with increasing delay """
# countdown_time = index * delay_seconds # Dynamic delay
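The commented-out `submit_task` above queues tasks asynchronously with an increasing delay; a hedged reconstruction of that pattern with Celery's `countdown` option follows. It assumes `video_query_task` from `tasks.py`, reuses the script's `file_paths`, `questions`, and `delay_seconds`, and assumes a worker consumes whatever queue that task lands on:
```
# Hedged reconstruction of the staggered-submission pattern described above.
from tasks import video_query_task

def submit_task(question, index):
    """Submit one Celery task, delaying each successive submission."""
    countdown_time = index * delay_seconds  # dynamic delay per task
    return video_query_task.apply_async(
        args=[file_paths[index], question],
        countdown=countdown_time,
    )

results = [submit_task(q, i) for i, q in enumerate(questions)]
```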


@@ -8,6 +8,7 @@ from PIL import Image
from pipeline_setup import pipe, IMAGE_TOKEN
from utils.image_processing import encode_image_base64
# api
async def image_query(file: UploadFile, question: str = Form(...)):
    """
    API endpoint to process an image with the user's query.
@@ -26,11 +27,8 @@ async def image_query(file: UploadFile, question: str = Form(...)):
    except Exception as e:
        return JSONResponse({"query": question, "error": str(e)})
# gradio
# async def image_query(image: np.ndarray, question: str):
#     """
#     API endpoint to process an image (as numpy array) with the user's query.
#     """
#     try:
#         # Convert the numpy array to a PIL Image
#         image = Image.fromarray(image).convert("RGB").resize((512, 512))
@@ -49,7 +47,7 @@ async def image_query(file: UploadFile, question: str = Form(...)):
#     except Exception as e:
#         return {"query": question, "error": str(e)}
# celery
# def image_query(image_path: str, question: str):
# try:
# print("image_path in image_query...")


@@ -24,6 +24,7 @@ def load_checkpoint(video_id):
        return json.load(f)
    return None
# api
# async def video_query(file: UploadFile, question: str = Form(...)):
#     try:
#         print("Processing video...")
@@ -131,7 +132,7 @@ def load_checkpoint(video_id):
#     except Exception as e:
#         return JSONResponse({"query": question, "error": str(e)})
# gradio
# async def video_query(video_path: str, question: str):
#     """
#     API endpoint to process a video file with the user's query.
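The `load_checkpoint` helper near the top of this file returns the saved JSON checkpoint for a video if one exists, else `None`; a hedged sketch of the matching writer, where the directory name and layout are assumptions:
```
import json
import os

CHECKPOINT_DIR = "checkpoints"  # assumed location of per-video checkpoints

def save_checkpoint(video_id, data):
    # Persist intermediate results so a long-running video job can resume.
    os.makedirs(CHECKPOINT_DIR, exist_ok=True)
    with open(os.path.join(CHECKPOINT_DIR, f"{video_id}.json"), "w") as f:
        json.dump(data, f)
```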

prompt.txt Normal file

@@ -0,0 +1,11 @@
Image
"Extract the following information from this image and return the result in JSON format:\n" "- Name: <name>\n" "- ID: <id>\n" "- Profile Picture: <url>\n" "- Follower Count: <count>\n" "- Likes Count: <count>\n" "- Bio: <bio>\n" "- Following Count: <count>\n" "- External Links: <links>\n" # "Provide no additional text other than the extracted information." "Do not include any disclaimers or comments like 'I'm sorry' or 'I can't assist'."
Video
"Based on the given images and audio script, extract detailed information about the products recommended in the video and format the output as JSON with the following fields:
1. **Product Name**: The specific name of the product, if mentioned.
2. **Category**: The specific category of the product (e.g., electronics, skincare, casual wear, etc.).
3. **Styles or Variants**: Any styles, designs, or variants of the product described (e.g., colors, patterns, sizes, or other distinguishing attributes).
4. **Highlights**: The unique selling points or notable features emphasized by the anchor (e.g., benefits, quality, or standout features).
5. **Promotional Details**: Any additional promotional information mentioned, such as discounts, offers, or key features that set the product apart."


@@ -6,8 +6,6 @@ mp.set_start_method("spawn", force=True)
# mp.set_start_method("fork", force=True)
from celery import Celery
import psutil
from pynvml import nvmlInit, nvmlDeviceGetCount, nvmlDeviceGetHandleByIndex, nvmlDeviceGetUtilizationRates
# from endpoints.video import run_video_inference
@@ -54,11 +52,6 @@ app.conf.worker_prefetch_multiplier = 1
#     include=["tasks"]
# )
# app.conf.task_routes = {
#     'tasks.preprocess_video': {'queue': 'preprocess_queue'},
#     'tasks.inference_video': {'queue': 'inference_queue'},
# }
app.conf.task_routes = {
    'tasks.preprocess_video': {'queue': 'preprocess_queue'},
    'tasks.inference_video': {'queue': 'inference_queue'},
@@ -93,11 +86,6 @@ app.conf.task_routes = {
# celery.conf.task_time_limit = 60 # 60 seconds max execution time
# celery.conf.task_soft_time_limit = 50 # Warn at 50 seconds
@app.task
def add(x, y):
    print("Adding task...")
    return x + y
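The `add` task above works as a smoke test for the Celery wiring; a minimal usage sketch, which assumes a result backend is configured and that some worker is also listening on the default `celery` queue (the `task_routes` above only route the video tasks):
```
# Smoke test; assumes a worker consuming the default "celery" queue.
from tasks import add

async_result = add.delay(2, 3)           # enqueue the task
print(async_result.get(timeout=10))      # -> 5 once a worker picks it up
```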
@app.task(name="tasks.text_query_task")
def text_query_task(question: str):
    print("Importing text_query...")
@@ -140,11 +128,6 @@ def video_query_task(file_path: str, question: str):
#     return run_video_inference(preprocessed_data)
# @celery.task(name="tasks.test_task")
# def test_task():
#     return "Celery is working!"
import mimetypes
from utils.video_processing import split_video_into_segments, extract_motion_key_frames, extract_audio_from_video
@@ -342,7 +325,7 @@ def preprocess_video(video_path, question):
# for preprocessed_data in preprocessed_results:
#     video_path = preprocessed_data["video_path"]
#     question = preprocessed_data["question"]
#     segments = preprocessed_data["processed_segments"]
#     segments = preprocessed_data["processed_data"]
#     print(f"Inferencing video: {video_path}")


@@ -1,92 +0,0 @@
import torch
import os
from whisper import load_model
from pydub import AudioSegment

def extract_audio_from_video(video_path: str) -> str:
    audio = AudioSegment.from_file(video_path)
    audio_path = "/tmp/temp_audio_test.wav"
    audio.export(audio_path, format="wav")
    print("audio extracted!")
    return audio_path
# def transcribe_audio(audio_path: str) -> str:
#     print("Loading model in transcribe_audio...")
#     from transformers import WhisperProcessor, WhisperForConditionalGeneration
#     import torch
#     # Load processor and model from transformers
#     processor = WhisperProcessor.from_pretrained("openai/whisper-base")
#     model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-base")
#     if torch.cuda.device_count() > 1:
#         print(f"Using {torch.cuda.device_count()} GPUs!")
#         model = torch.nn.DataParallel(model)
#     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#     model.to(device)
#     print("Model loaded successfully.")
#     print(audio_path)
#     # Load and process the audio file
#     import librosa
#     audio_input, sr = librosa.load(audio_path, sr=16000)
#     input_features = processor(audio_input, sampling_rate=sr, return_tensors="pt").input_features.to(device)
#     # Generate transcription
#     with torch.no_grad():
#         if isinstance(model, torch.nn.DataParallel):
#             generated_ids = model.module.generate(input_features)
#         else:
#             generated_ids = model.generate(input_features)
#     # Decode the generated tokens to text
#     transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
#     return transcription
def transcribe_audio(audio_path: str) -> str:
    print("Loading model in transcribe_audio...")
    os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"
    from transformers import WhisperProcessor, WhisperForConditionalGeneration
    import torch
    # Load processor and model from transformers
    processor = WhisperProcessor.from_pretrained("openai/whisper-tiny")
    model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny")
    if torch.cuda.device_count() > 1:
        print(f"Using {torch.cuda.device_count()} GPUs!")
        model = torch.nn.DataParallel(model)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    print("Model loaded successfully.")
    print(audio_path)
    # Load and process the audio file
    import librosa
    audio_input, sr = librosa.load(audio_path, sr=16000)
    input_features = processor(audio_input, sampling_rate=sr, return_tensors="pt").input_features.to(device)
    # Generate transcription
    with torch.no_grad():
        if isinstance(model, torch.nn.DataParallel):
            generated_ids = model.module.generate(input_features)
        else:
            generated_ids = model.generate(input_features)
    # Decode the generated tokens to text
    transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
    return transcription
if __name__ == "__main__":
    extract_audio_from_video("../video/1.mp4")
    audio_file = "/tmp/temp_audio_test.wav"
    for i in range(3):
        print(f"\nTranscription attempt {i + 1}:")
        transcription = transcribe_audio(audio_file)
        print("Transcription:")
        print(transcription)