updated readme
This commit is contained in:
parent d956ce6678
commit 7fb9de49b7

README.md (52 lines changed)
@@ -1,2 +1,52 @@
-# api
+# API Usage Guide

This project supports multiple methods to start and interact with the API.

## Method 1: Standard API Start

Start the API using the main script:

```
python main.py
```
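
`main.py` itself is not part of this commit's diff. The endpoint signatures shown later (`UploadFile`, `Form(...)`, `JSONResponse`) suggest a FastAPI-style app, so a purely hypothetical sketch of the entry point might look like the following; the module path, route, and port are assumptions, not the repository's actual code.

```
# Hypothetical sketch only -- the real main.py is not shown in this commit.
from fastapi import FastAPI, UploadFile, Form

# Assumed import path; the diff only shows hunks from endpoints/image.py and endpoints/video.py.
from endpoints.image import image_query

app = FastAPI()

@app.post("/image")
async def image_endpoint(file: UploadFile, question: str = Form(...)):
    # Delegate to the endpoint implementation diffed later in this commit.
    return await image_query(file, question)

if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8000)  # assumed host/port
```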

## Method 2: Gradio UI

To use the Gradio UI:

1. In the `endpoints/` directory, switch the comments so that the 'gradio' variant of each endpoint is the active one (see the sketch below).
2. Run the UI script:

```
python ui.py
```
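
The "switch the comments" step refers to the `# api`, `# gradio`, and `# celery` markers this commit adds to the endpoint files: each endpoint keeps one variant per mode, and only one is left uncommented. A rough sketch of the toggle in `endpoints/image.py`, with signatures taken from the diff further down (bodies elided, layout approximate):

```
# Sketch of the comment toggle in endpoints/image.py (signatures from this commit's diff).
import numpy as np

# api variant -- active by default, commented out here for the Gradio UI:
# async def image_query(file: UploadFile, question: str = Form(...)):
#     ...

# gradio variant -- uncomment this one (and comment the api variant) before running ui.py:
async def image_query(image: np.ndarray, question: str):
    """Process an image passed as a numpy array with the user's query."""
    ...
```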

## Method 3: Celery Optimization

Use Celery for optimized background task processing. You can adjust the concurrency parameters as needed.

### Step 1: Start Redis Server

```
redis-server
```
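
The Celery app in `tasks.py` must point at this Redis instance as its broker (the debug script prints `app.conf.broker_url` to confirm it). The exact URL is not shown in the diff; a minimal sketch assuming a default local Redis:

```
# Sketch of the broker configuration in tasks.py; the URLs below assume a
# default local Redis and are not taken from the repository.
from celery import Celery

app = Celery(
    "tasks",
    broker="redis://localhost:6379/0",   # assumed broker address
    backend="redis://localhost:6379/1",  # assumed result backend
)
print("Broker:", app.conf.broker_url)
```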

### Step 2: Start Celery Workers

Open two separate terminals and run the following:

**Terminal 1 (Preprocessing Queue):**

```
celery -A tasks worker --pool=threads --loglevel=info --concurrency=2 --queues=preprocess_queue
```

**Terminal 2 (Inference Queue):**

```
celery -A tasks worker --pool=threads --loglevel=info --concurrency=3 --queues=inference_queue
```
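
Each worker only consumes the queue named in its `--queues` flag; `tasks.py` (diffed later in this commit) routes the two task types to those queues, continuing from the `app` object sketched above:

```
# Queue routing from tasks.py in this commit: preprocessing and inference
# tasks are sent to the queues the two workers above listen on.
app.conf.task_routes = {
    'tasks.preprocess_video': {'queue': 'preprocess_queue'},
    'tasks.inference_video': {'queue': 'inference_queue'},
}
```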

### Step 3: Run Debug Script

```
python celery_debug.py
```
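
`celery_debug.py` (also touched in this commit) builds a list of video paths and questions for `NUM_TASKS` tasks and submits them to the workers asynchronously. Stripped down, the pattern looks roughly like this; the exact submission call is commented out in the script, so the `.delay(...)` line is an assumption based on the task names defined in tasks.py:

```
# Simplified sketch of celery_debug.py; the real script also handles
# checkpointing, per-task delays, and result collection.
from tasks import video_query_task  # Celery task defined in tasks.py

NUM_TASKS = 1
file_paths = [f"../video/3.mp4" for _ in range(NUM_TASKS)]
questions = ["<prompt text, e.g. the product-extraction prompt from prompt.txt>"
             for _ in range(NUM_TASKS)]

# .delay() queues each task on the Celery broker; if several workers are
# running, they process the tasks in parallel.
results = [video_query_task.delay(path, q) for path, q in zip(file_paths, questions)]
for r in results:
    print(r.get())  # blocks until the worker finishes the task
```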
@@ -41,7 +41,7 @@ print("Broker:", app.conf.broker_url)
 # Define the number of concurrent tasks
 NUM_TASKS = 1
 delay_seconds = 0
-file_paths = [f"../video/film4.mp4" for _ in range(NUM_TASKS)]
+file_paths = [f"../video/3.mp4" for _ in range(NUM_TASKS)]

 # video_folder = "../video"
@@ -71,29 +71,29 @@ file_paths = [f"../video/film4.mp4" for _ in range(NUM_TASKS)]
 # for i in range(NUM_TASKS)
 # ]

-# questions = [
-# f"Based on the given images and audio script, extract detailed information about the products recommended in the video and format the output as JSON with the following fields:\n"
-# f"- Product Name: <name>\n"
-# f"- Category: <category>\n"
-# f"- Styles or Variants: <styles/variants>\n"
-# f"- Highlights: <highlights>\n"
-# f"- Promotional Details: <promotional_details>\n"
-# f"Do not include any disclaimers or comments like 'I'm sorry' or 'I can't assist.' Task {i}"
-# for i in range(NUM_TASKS)
-# ]

 questions = [
-    f"Generate a screenplay based on the given video content and format the output as JSON with the following structured fields:\n"
-    f"- Scene Descriptions: <visual setting including background, lighting, atmosphere>\n"
-    f"- Character Introductions: <key characters with appearance and notable traits>\n"
-    f"- Dialogue: <transcribed spoken lines in screenplay format>\n"
-    f"- Actions & Expressions: <non-verbal cues and interactions>\n"
-    f"- Product Integrations: <product names, categories, features if applicable>\n"
-    f"- Narrative Flow: <scene transitions and pacing notes>\n"
-    f"Follow standard screenplay formatting for headers, character names, dialogue, and actions. Do not include disclaimers or comments like 'I can't assist.' Task {i}"
+    f"Based on the given images and audio script, extract detailed information about the products recommended in the video and format the output as JSON with the following fields:\n"
+    f"- Product Name: <name>\n"
+    f"- Category: <category>\n"
+    f"- Styles or Variants: <styles/variants>\n"
+    f"- Highlights: <highlights>\n"
+    f"- Promotional Details: <promotional_details>\n"
+    f"Do not include any disclaimers or comments like 'I'm sorry' or 'I can't assist.' Task {i}"
     for i in range(NUM_TASKS)
 ]

+# questions = [
+# f"Generate a screenplay based on the given video content and format the output as JSON with the following structured fields:\n"
+# f"- Scene Descriptions: <visual setting including background, lighting, atmosphere>\n"
+# f"- Character Introductions: <key characters with appearance and notable traits>\n"
+# f"- Dialogue: <transcribed spoken lines in screenplay format>\n"
+# f"- Actions & Expressions: <non-verbal cues and interactions>\n"
+# f"- Product Integrations: <product names, categories, features if applicable>\n"
+# f"- Narrative Flow: <scene transitions and pacing notes>\n"
+# f"Follow standard screenplay formatting for headers, character names, dialogue, and actions. Do not include disclaimers or comments like 'I can't assist.' Task {i}"
+# for i in range(NUM_TASKS)
+# ]

 # def submit_task(question, index): # sends tasks to Celery asynchronously, queues the tasks in Celery broker. If multiple Celery workers, they process tasks in parallel.
 #     """ Submits a Celery task with increasing delay """
 #     countdown_time = index * delay_seconds  # Dynamic delay
@@ -8,6 +8,7 @@ from PIL import Image
 from pipeline_setup import pipe, IMAGE_TOKEN
 from utils.image_processing import encode_image_base64

+# api
 async def image_query(file: UploadFile, question: str = Form(...)):
     """
     API endpoint to process an image with the user's query.
@@ -26,11 +27,8 @@ async def image_query(file: UploadFile, question: str = Form(...)):
     except Exception as e:
         return JSONResponse({"query": question, "error": str(e)})

+# gradio
 # async def image_query(image: np.ndarray, question: str):
-#     """
-#     API endpoint to process an image (as numpy array) with the user's query.
-#     """
 #     try:
 #         # Convert the numpy array to a PIL Image
 #         image = Image.fromarray(image).convert("RGB").resize((512, 512))
@@ -49,7 +47,7 @@ async def image_query(file: UploadFile, question: str = Form(...)):
 #     except Exception as e:
 #         return {"query": question, "error": str(e)}

+# celery
 # def image_query(image_path: str, question: str):
 #     try:
 #         print("image_path in image_query...")
@@ -24,6 +24,7 @@ def load_checkpoint(video_id):
            return json.load(f)
    return None

+# api
 # async def video_query(file: UploadFile, question: str = Form(...)):
 #     try:
 #         print("Processing video...")
@@ -131,7 +132,7 @@ def load_checkpoint(video_id):
 #     except Exception as e:
 #         return JSONResponse({"query": question, "error": str(e)})

+# gradio
 # async def video_query(video_path: str, question: str):
 #     """
 #     API endpoint to process a video file with the user's query.
prompt.txt (new file, 11 lines)

@@ -0,0 +1,11 @@
图片 (image)
"Extract the following information from this image and return the result in JSON format:\n" "- Name: <name>\n" "- ID: <id>\n" "- Profile Picture: <url>\n" "- Follower Count: <count>\n" "- Likes Count: <count>\n" "- Bio: <bio>\n" "- Following Count: <count>\n" "- External Links: <links>\n" # "Provide no additional text other than the extracted information." "Do not include any disclaimers or comments like 'I'm sorry' or 'I can't assist'."


视频 (video)
“Based on the given images and audio script, extract detailed information about the products recommended in the video and format the output as JSON with the following fields:
1. **Product Name**: The specific name of the product, if mentioned.
2. **Category**: The specific category of the product (e.g., electronics, skincare, casual wear, etc.).
3. **Styles or Variants**: Any styles, designs, or variants of the product described (e.g., colors, patterns, sizes, or other distinguishing attributes).
4. **Highlights**: The unique selling points or notable features emphasized by the anchor (e.g., benefits, quality, or standout features).
5. **Promotional Details**: Any additional promotional information mentioned, such as discounts, offers, or key features that set the product apart.”
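
For reference, the video prompt above asks the model to emit JSON shaped roughly like the following; the field names come from the prompt, while the values here are made up purely for illustration.

```
# Illustrative only -- example of the JSON shape requested by the video prompt in prompt.txt.
example_output = {
    "Product Name": "Example hand cream",
    "Category": "skincare",
    "Styles or Variants": ["rose scent", "unscented"],
    "Highlights": "lightweight texture, absorbs quickly",
    "Promotional Details": "buy one get one free during the livestream",
}
```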
tasks.py (19 lines changed)

@@ -6,8 +6,6 @@ mp.set_start_method("spawn", force=True)
 # mp.set_start_method("fork", force=True)

 from celery import Celery

-import psutil
 from pynvml import nvmlInit, nvmlDeviceGetCount, nvmlDeviceGetHandleByIndex, nvmlDeviceGetUtilizationRates

 # from endpoints.video import run_video_inference
@@ -54,11 +52,6 @@ app.conf.worker_prefetch_multiplier = 1
 # include=["tasks"]
 # )

-# app.conf.task_routes = {
-# 'tasks.preprocess_video': {'queue': 'preprocess_queue'},
-# 'tasks.inference_video': {'queue': 'inference_queue'},
-# }

 app.conf.task_routes = {
     'tasks.preprocess_video': {'queue': 'preprocess_queue'},
     'tasks.inference_video': {'queue': 'inference_queue'},
@@ -93,11 +86,6 @@ app.conf.task_routes = {
 # celery.conf.task_time_limit = 60 # 60 seconds max execution time
 # celery.conf.task_soft_time_limit = 50 # Warn at 50 seconds

-@app.task
-def add(x, y):
-    print("Adding task...")
-    return x + y

 @app.task(name="tasks.text_query_task")
 def text_query_task(question: str):
     print("Importing text_query...")
@@ -140,11 +128,6 @@ def video_query_task(file_path: str, question: str):
 #     return run_video_inference(preprocessed_data)


-# @celery.task(name="tasks.test_task")
-# def test_task():
-#     return "Celery is working!"


 import mimetypes
 from utils.video_processing import split_video_into_segments, extract_motion_key_frames, extract_audio_from_video
@@ -342,7 +325,7 @@ def preprocess_video(video_path, question):
 # for preprocessed_data in preprocessed_results:
 # video_path = preprocessed_data["video_path"]
 # question = preprocessed_data["question"]
-# segments = preprocessed_data["processed_segments"]
+# segments = preprocessed_data["processed_data"]

 # print(f"Inferencing video: {video_path}")
@@ -1,92 +0,0 @@
-import torch
-import os
-from whisper import load_model
-from pydub import AudioSegment
-
-def extract_audio_from_video(video_path: str) -> str:
-    audio = AudioSegment.from_file(video_path)
-    audio_path = "/tmp/temp_audio_test.wav"
-    audio.export(audio_path, format="wav")
-    print("video extracted!")
-    return audio_path
-
-# def transcribe_audio(audio_path: str) -> str:
-#     print("Loading model in transcribe_audio...")
-#     from transformers import WhisperProcessor, WhisperForConditionalGeneration
-#     import torch
-
-#     # Load processor and model from transformers
-#     processor = WhisperProcessor.from_pretrained("openai/whisper-base")
-#     model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-base")
-
-#     if torch.cuda.device_count() > 1:
-#         print(f"Using {torch.cuda.device_count()} GPUs!")
-#         model = torch.nn.DataParallel(model)
-
-#     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-#     model.to(device)
-#     print("Model loaded successfully.")
-#     print(audio_path)
-
-#     # Load and process the audio file
-#     import librosa
-#     audio_input, sr = librosa.load(audio_path, sr=16000)
-#     input_features = processor(audio_input, sampling_rate=sr, return_tensors="pt").input_features.to(device)
-
-#     # Generate transcription
-#     with torch.no_grad():
-#         if isinstance(model, torch.nn.DataParallel):
-#             generated_ids = model.module.generate(input_features)
-#         else:
-#             generated_ids = model.generate(input_features)
-
-#     # Decode the generated tokens to text
-#     transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
-#     return transcription
-
-def transcribe_audio(audio_path: str) -> str:
-    print("Loading model in transcribe_audio...")
-    os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"
-
-    from transformers import WhisperProcessor, WhisperForConditionalGeneration
-    import torch
-
-    # Load processor and model from transformers
-    processor = WhisperProcessor.from_pretrained("openai/whisper-tiny")
-    model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny")
-
-    if torch.cuda.device_count() > 1:
-        print(f"Using {torch.cuda.device_count()} GPUs!")
-        model = torch.nn.DataParallel(model)
-
-    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-    model.to(device)
-    print("Model loaded successfully.")
-    print(audio_path)
-
-    # Load and process the audio file
-    import librosa
-    audio_input, sr = librosa.load(audio_path, sr=16000)
-    input_features = processor(audio_input, sampling_rate=sr, return_tensors="pt").input_features.to(device)
-
-    # Generate transcription
-    with torch.no_grad():
-        if isinstance(model, torch.nn.DataParallel):
-            generated_ids = model.module.generate(input_features)
-        else:
-            generated_ids = model.generate(input_features)
-
-    # Decode the generated tokens to text
-    transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
-    return transcription
-
-if __name__ == "__main__":
-    extract_audio_from_video("../video/1.mp4")
-
-    audio_file = "/tmp/temp_audio_test.wav"
-
-    for i in range(3):
-        print(f"\nTranscription attempt {i + 1}:")
-        transcription = transcribe_audio(audio_file)
-        print("Transcription:")
-        print(transcription)