Spaces:

nightey3s
/

profanity-detection

Running on Zero

App Files Files Community

nightey3s committed on Mar 16

Commit

8cf5bd1

unverified ·

1 Parent(s): 984bc80

Fix compatibility for ZeroGPU

Browse files

Files changed (3) hide show

profanity_detector.py +163 -78
requirements.txt +2 -1
temp_tts_output_1742102180.wav +0 -0

profanity_detector.py CHANGED Viewed

@@ -24,19 +24,12 @@ logging.basicConfig(
 )
 logger = logging.getLogger('profanity_detector')
-# ZeroGPU COMPATIBILITY NOTES:
-# The @spaces.GPU decorators throughout this code enable compatibility with Hugging Face ZeroGPU.
-# - They request GPU resources only when needed and release them after function completion
-# - They have no effect when running in local environments or standard GPU Spaces
-# - Custom durations can be specified for functions requiring longer processing times
-# - For local development, you'll need: pip install huggingface_hub[spaces]
 # Detect if we're running in a ZeroGPU environment
 IS_ZEROGPU = os.environ.get("SPACE_RUNTIME_STATELESS", "0") == "1"
 # Define device strategy that works in both environments
 if IS_ZEROGPU:
-    # In ZeroGPU: initialize on CPU, will use GPU only in @spaces.GPU functions
     device = torch.device("cpu")
     logger.info("ZeroGPU environment detected. Using CPU for initial loading.")
 else:
@@ -44,10 +37,6 @@ else:
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
     logger.info(f"Local environment. Using device: {device}")
-# Define device at the top of the script (global scope)
-device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-logger.info(f"Using device: {device}")
 # Global variables for models
 profanity_model = None
 profanity_tokenizer = None
@@ -77,79 +66,73 @@ def load_models():
         profanity_tokenizer = AutoTokenizer.from_pretrained(PROFANITY_MODEL)
         # Load model without moving to CUDA directly
-        if IS_ZEROGPU:
-            logger.info("ZeroGPU mode: Loading model without CUDA initialization")
-            # For ZeroGPU, use device_map='auto' or just stay on CPU
-            profanity_model = AutoModelForSequenceClassification.from_pretrained(
-                PROFANITY_MODEL,
-                device_map=None,  # Explicitly stay on CPU
-                low_cpu_mem_usage=True
-            )
-        else:
-            # For local runs, normal loading with CUDA if available
-            profanity_model = AutoModelForSequenceClassification.from_pretrained(PROFANITY_MODEL)
-            if torch.cuda.is_available():
-                profanity_model = profanity_model.to(device)
-                try:
-                    profanity_model = profanity_model.half()
-                    logger.info("Successfully converted profanity model to half precision")
-                except Exception as e:
-                    logger.warning(f"Could not convert to half precision: {str(e)}")
-        # Apply similar changes to all other model loading...
         logger.info("Loading detoxification model...")
         T5_MODEL = "s-nlp/t5-paranmt-detox"
         t5_tokenizer = AutoTokenizer.from_pretrained(T5_MODEL)
-        if IS_ZEROGPU:
-            t5_model = AutoModelForSeq2SeqLM.from_pretrained(
-                T5_MODEL,
-                device_map=None,
-                low_cpu_mem_usage=True
-            )
-        else:
-            t5_model = AutoModelForSeq2SeqLM.from_pretrained(T5_MODEL)
-            if torch.cuda.is_available():
-                t5_model = t5_model.to(device)
-                try:
-                    t5_model = t5_model.half()
-                    logger.info("Successfully converted T5 model to half precision")
-                except Exception as e:
-                    logger.warning(f"Could not convert to half precision: {str(e)}")
         logger.info("Loading Whisper speech-to-text model...")
-        if IS_ZEROGPU:
-            # For ZeroGPU, stay on CPU in the main process
-            whisper_model = whisper.load_model("medium", device="cpu")
-        else:
-            whisper_model = whisper.load_model("large")
-            if torch.cuda.is_available():
-                whisper_model = whisper_model.to(device)
         logger.info("Loading Text-to-Speech model...")
         TTS_MODEL = "microsoft/speecht5_tts"
         tts_processor = SpeechT5Processor.from_pretrained(TTS_MODEL)
-        if IS_ZEROGPU:
-            tts_model = SpeechT5ForTextToSpeech.from_pretrained(
-                TTS_MODEL,
-                device_map=None,
-                low_cpu_mem_usage=True
-            )
-            vocoder = SpeechT5HifiGan.from_pretrained(
-                "microsoft/speecht5_hifigan",
-                device_map=None,
-                low_cpu_mem_usage=True
-            )
-        else:
-            tts_model = SpeechT5ForTextToSpeech.from_pretrained(TTS_MODEL)
-            vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
-            if torch.cuda.is_available():
-                tts_model = tts_model.to(device)
-                vocoder = vocoder.to(device)
         # Speaker embeddings - always on CPU for ZeroGPU
         speaker_embeddings = torch.zeros((1, 512))
         if not IS_ZEROGPU and torch.cuda.is_available():
             speaker_embeddings = speaker_embeddings.to(device)
@@ -182,8 +165,17 @@ def detect_profanity(text: str, threshold: float = 0.5):
     try:
         # Detect profanity and score
         inputs = profanity_tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
-        if torch.cuda.is_available():
-            inputs = inputs.to(device)
         with torch.no_grad():
             outputs = profanity_model(**inputs).logits
@@ -201,7 +193,7 @@ def detect_profanity(text: str, threshold: float = 0.5):
                 word_inputs = profanity_tokenizer(word, return_tensors="pt", truncation=True, max_length=512)
                 if torch.cuda.is_available():
-                    word_inputs = word_inputs.to(device)
                 with torch.no_grad():
                     word_outputs = profanity_model(**word_inputs).logits
@@ -211,6 +203,10 @@ def detect_profanity(text: str, threshold: float = 0.5):
                 if word_score > threshold:
                     profane_words.append(word.lower())
         # Create highlighted version of the text
         highlighted_text = create_highlighted_text(text, profane_words)
@@ -225,6 +221,12 @@ def detect_profanity(text: str, threshold: float = 0.5):
     except Exception as e:
         error_msg = f"Error in profanity detection: {str(e)}"
         logger.error(error_msg)
         return {"error": error_msg, "text": text, "score": 0, "profanity": False}
 def create_highlighted_text(text, profane_words):
@@ -255,8 +257,16 @@ def rephrase_profanity(text):
     try:
         # Rephrase using the detoxification model
         inputs = t5_tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
-        if torch.cuda.is_available():
-            inputs = inputs.to(device)
         # Use more conservative generation settings with error handling
         try:
@@ -275,6 +285,10 @@ def rephrase_profanity(text):
                 logger.warning(f"T5 model produced unusable output: '{rephrased_text}'")
                 return text  # Return original if output is too short
             return rephrased_text.strip()
         except RuntimeError as e:
@@ -289,6 +303,11 @@ def rephrase_profanity(text):
                     early_stopping=True
                 )
                 rephrased_text = t5_tokenizer.decode(outputs[0], skip_special_tokens=True)
                 return rephrased_text.strip()
             else:
                 raise e  # Re-raise if it's not a memory issue
@@ -296,6 +315,12 @@ def rephrase_profanity(text):
     except Exception as e:
         error_msg = f"Error in rephrasing: {str(e)}"
         logger.error(error_msg)
         return text  # Return original text if rephrasing fails
 @spaces.GPU
@@ -312,19 +337,37 @@ def text_to_speech(text):
         # Process the text input
         inputs = tts_processor(text=text, return_tensors="pt")
-        if torch.cuda.is_available():
-            inputs = inputs.to(device)
         # Generate speech with a fixed speaker embedding
         speech = tts_model.generate_speech(
             inputs["input_ids"],
-            speaker_embeddings,
             vocoder=vocoder
         )
         # Convert from PyTorch tensor to NumPy array
         speech_np = speech.cpu().numpy()
         # Save as WAV file (sampling rate is 16kHz for SpeechT5)
         write_wav(temp_file, 16000, speech_np)
@@ -332,6 +375,13 @@ def text_to_speech(text):
     except Exception as e:
         error_msg = f"Error in text-to-speech conversion: {str(e)}"
         logger.error(error_msg)
         return None
 def text_analysis(input_text, threshold=0.5):
@@ -402,10 +452,19 @@ def analyze_audio(audio_path, threshold=0.5):
         return "No audio provided.", None, None
     try:
         # Transcribe audio
         result = whisper_model.transcribe(audio_path, fp16=torch.cuda.is_available())
         text = result["text"]
         # Detect profanity with user-defined threshold
         analysis = detect_profanity(text, threshold=threshold)
@@ -432,6 +491,12 @@ def analyze_audio(audio_path, threshold=0.5):
     except Exception as e:
         error_msg = f"Error in audio analysis: {str(e)}\n{traceback.format_exc()}"
         logger.error(error_msg)
         return error_msg, None, None
 # Global variables to store streaming results
@@ -497,10 +562,19 @@ def process_stream_chunk(audio_chunk):
             stream_results["profanity_info"] = "Error: Failed to create audio file for processing"
             return stream_results["transcript"], stream_results["profanity_info"], stream_results["clean_text"], stream_results["audio_output"]
         # Process with Whisper
         result = whisper_model.transcribe(temp_file, fp16=torch.cuda.is_available())
         transcript = result["text"].strip()
         # Skip processing if transcript is empty
         if not transcript:
             # Clean up temp file if we created it
@@ -554,6 +628,17 @@ def process_stream_chunk(audio_chunk):
         error_msg = f"Error processing streaming audio: {str(e)}\n{traceback.format_exc()}"
         logger.error(error_msg)
         # Update profanity info with error message
         stream_results["profanity_info"] = f"Error: {str(e)}"

 )
 logger = logging.getLogger('profanity_detector')
 # Detect if we're running in a ZeroGPU environment
 IS_ZEROGPU = os.environ.get("SPACE_RUNTIME_STATELESS", "0") == "1"
 # Define device strategy that works in both environments
 if IS_ZEROGPU:
+    # In ZeroGPU: always initialize on CPU, will use GPU only in @spaces.GPU functions
     device = torch.device("cpu")
     logger.info("ZeroGPU environment detected. Using CPU for initial loading.")
 else:
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
     logger.info(f"Local environment. Using device: {device}")
 # Global variables for models
 profanity_model = None
 profanity_tokenizer = None
         profanity_tokenizer = AutoTokenizer.from_pretrained(PROFANITY_MODEL)
         # Load model without moving to CUDA directly
+        profanity_model = AutoModelForSequenceClassification.from_pretrained(
+            PROFANITY_MODEL,
+            device_map=None,  # Stay on CPU for now
+            low_cpu_mem_usage=True
+        )
+        # Only move to device if NOT in ZeroGPU mode
+        if not IS_ZEROGPU and torch.cuda.is_available():
+            profanity_model = profanity_model.to(device)
+            try:
+                profanity_model = profanity_model.half()
+                logger.info("Successfully converted profanity model to half precision")
+            except Exception as e:
+                logger.warning(f"Could not convert to half precision: {str(e)}")
         logger.info("Loading detoxification model...")
         T5_MODEL = "s-nlp/t5-paranmt-detox"
         t5_tokenizer = AutoTokenizer.from_pretrained(T5_MODEL)
+        t5_model = AutoModelForSeq2SeqLM.from_pretrained(
+            T5_MODEL,
+            device_map=None,  # Stay on CPU for now
+            low_cpu_mem_usage=True
+        )
+        # Only move to device if NOT in ZeroGPU mode
+        if not IS_ZEROGPU and torch.cuda.is_available():
+            t5_model = t5_model.to(device)
+            try:
+                t5_model = t5_model.half()
+                logger.info("Successfully converted T5 model to half precision")
+            except Exception as e:
+                logger.warning(f"Could not convert to half precision: {str(e)}")
         logger.info("Loading Whisper speech-to-text model...")
+        # Always load on CPU in ZeroGPU mode
+        #whisper_model = whisper.load_model("medium" if IS_ZEROGPU else "large", device="cpu")
+        whisper_model = whisper.load_model("large-v2", device="cpu")
+        # Only move to device if NOT in ZeroGPU mode
+        if not IS_ZEROGPU and torch.cuda.is_available():
+            whisper_model = whisper_model.to(device)
         logger.info("Loading Text-to-Speech model...")
         TTS_MODEL = "microsoft/speecht5_tts"
         tts_processor = SpeechT5Processor.from_pretrained(TTS_MODEL)
+        tts_model = SpeechT5ForTextToSpeech.from_pretrained(
+            TTS_MODEL,
+            device_map=None,  # Stay on CPU for now
+            low_cpu_mem_usage=True
+        )
+        vocoder = SpeechT5HifiGan.from_pretrained(
+            "microsoft/speecht5_hifigan",
+            device_map=None,  # Stay on CPU for now
+            low_cpu_mem_usage=True
+        )
+        # Only move to device if NOT in ZeroGPU mode
+        if not IS_ZEROGPU and torch.cuda.is_available():
+            tts_model = tts_model.to(device)
+            vocoder = vocoder.to(device)
         # Speaker embeddings - always on CPU for ZeroGPU
         speaker_embeddings = torch.zeros((1, 512))
+        # Only move to device if NOT in ZeroGPU mode
         if not IS_ZEROGPU and torch.cuda.is_available():
             speaker_embeddings = speaker_embeddings.to(device)
     try:
         # Detect profanity and score
         inputs = profanity_tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
+        # In ZeroGPU, move to GPU here inside the spaces.GPU function
+        # For local environments, it might already be on the correct device
+        current_device = device
+        if IS_ZEROGPU and torch.cuda.is_available():
+            current_device = torch.device("cuda")
+            inputs = inputs.to(current_device)
+            # Only in ZeroGPU mode, we need to move the model to GPU inside the function
+            profanity_model.to(current_device)
+        elif torch.cuda.is_available():  # Local environment with CUDA
+            inputs = inputs.to(current_device)
         with torch.no_grad():
             outputs = profanity_model(**inputs).logits
                 word_inputs = profanity_tokenizer(word, return_tensors="pt", truncation=True, max_length=512)
                 if torch.cuda.is_available():
+                    word_inputs = word_inputs.to(current_device)
                 with torch.no_grad():
                     word_outputs = profanity_model(**word_inputs).logits
                 if word_score > threshold:
                     profane_words.append(word.lower())
+        # Move model back to CPU if in ZeroGPU mode - to free GPU memory
+        if IS_ZEROGPU and torch.cuda.is_available():
+            profanity_model.to(torch.device("cpu"))
         # Create highlighted version of the text
         highlighted_text = create_highlighted_text(text, profane_words)
     except Exception as e:
         error_msg = f"Error in profanity detection: {str(e)}"
         logger.error(error_msg)
+        # Make sure model is on CPU if in ZeroGPU mode - to free GPU memory
+        if IS_ZEROGPU and torch.cuda.is_available():
+            try:
+                profanity_model.to(torch.device("cpu"))
+            except:
+                pass
         return {"error": error_msg, "text": text, "score": 0, "profanity": False}
 def create_highlighted_text(text, profane_words):
     try:
         # Rephrase using the detoxification model
         inputs = t5_tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
+        # In ZeroGPU, move to GPU here inside the spaces.GPU function
+        current_device = device
+        if IS_ZEROGPU and torch.cuda.is_available():
+            current_device = torch.device("cuda")
+            inputs = inputs.to(current_device)
+            # Only in ZeroGPU mode, we need to move the model to GPU inside the function
+            t5_model.to(current_device)
+        elif torch.cuda.is_available():  # Local environment with CUDA
+            inputs = inputs.to(current_device)
         # Use more conservative generation settings with error handling
         try:
                 logger.warning(f"T5 model produced unusable output: '{rephrased_text}'")
                 return text  # Return original if output is too short
+            # Move model back to CPU if in ZeroGPU mode - to free GPU memory
+            if IS_ZEROGPU and torch.cuda.is_available():
+                t5_model.to(torch.device("cpu"))
             return rephrased_text.strip()
         except RuntimeError as e:
                     early_stopping=True
                 )
                 rephrased_text = t5_tokenizer.decode(outputs[0], skip_special_tokens=True)
+                # Move model back to CPU if in ZeroGPU mode - to free GPU memory
+                if IS_ZEROGPU and torch.cuda.is_available():
+                    t5_model.to(torch.device("cpu"))
                 return rephrased_text.strip()
             else:
                 raise e  # Re-raise if it's not a memory issue
     except Exception as e:
         error_msg = f"Error in rephrasing: {str(e)}"
         logger.error(error_msg)
+        # Make sure model is on CPU if in ZeroGPU mode - to free GPU memory
+        if IS_ZEROGPU and torch.cuda.is_available():
+            try:
+                t5_model.to(torch.device("cpu"))
+            except:
+                pass
         return text  # Return original text if rephrasing fails
 @spaces.GPU
         # Process the text input
         inputs = tts_processor(text=text, return_tensors="pt")
+        # In ZeroGPU, move to GPU here inside the spaces.GPU function
+        current_device = device
+        if IS_ZEROGPU and torch.cuda.is_available():
+            current_device = torch.device("cuda")
+            inputs = inputs.to(current_device)
+            # Only in ZeroGPU mode, we need to move the models to GPU inside the function
+            tts_model.to(current_device)
+            vocoder.to(current_device)
+            speaker_embeddings_local = speaker_embeddings.to(current_device)
+        elif torch.cuda.is_available():  # Local environment with CUDA
+            inputs = inputs.to(current_device)
+            speaker_embeddings_local = speaker_embeddings
+        else:
+            speaker_embeddings_local = speaker_embeddings
         # Generate speech with a fixed speaker embedding
         speech = tts_model.generate_speech(
             inputs["input_ids"],
+            speaker_embeddings_local,
             vocoder=vocoder
         )
         # Convert from PyTorch tensor to NumPy array
         speech_np = speech.cpu().numpy()
+        # Move models back to CPU if in ZeroGPU mode - to free GPU memory
+        if IS_ZEROGPU and torch.cuda.is_available():
+            tts_model.to(torch.device("cpu"))
+            vocoder.to(torch.device("cpu"))
         # Save as WAV file (sampling rate is 16kHz for SpeechT5)
         write_wav(temp_file, 16000, speech_np)
     except Exception as e:
         error_msg = f"Error in text-to-speech conversion: {str(e)}"
         logger.error(error_msg)
+        # Make sure models are on CPU if in ZeroGPU mode - to free GPU memory
+        if IS_ZEROGPU and torch.cuda.is_available():
+            try:
+                tts_model.to(torch.device("cpu"))
+                vocoder.to(torch.device("cpu"))
+            except:
+                pass
         return None
 def text_analysis(input_text, threshold=0.5):
         return "No audio provided.", None, None
     try:
+        # In ZeroGPU mode, models need to be moved to GPU
+        if IS_ZEROGPU and torch.cuda.is_available():
+            current_device = torch.device("cuda")
+            whisper_model.to(current_device)
         # Transcribe audio
         result = whisper_model.transcribe(audio_path, fp16=torch.cuda.is_available())
         text = result["text"]
+        # Move whisper model back to CPU if in ZeroGPU mode
+        if IS_ZEROGPU and torch.cuda.is_available():
+            whisper_model.to(torch.device("cpu"))
         # Detect profanity with user-defined threshold
         analysis = detect_profanity(text, threshold=threshold)
     except Exception as e:
         error_msg = f"Error in audio analysis: {str(e)}\n{traceback.format_exc()}"
         logger.error(error_msg)
+        # Make sure models are on CPU if in ZeroGPU mode
+        if IS_ZEROGPU and torch.cuda.is_available():
+            try:
+                whisper_model.to(torch.device("cpu"))
+            except:
+                pass
         return error_msg, None, None
 # Global variables to store streaming results
             stream_results["profanity_info"] = "Error: Failed to create audio file for processing"
             return stream_results["transcript"], stream_results["profanity_info"], stream_results["clean_text"], stream_results["audio_output"]
+        # In ZeroGPU mode, move whisper model to GPU
+        if IS_ZEROGPU and torch.cuda.is_available():
+            current_device = torch.device("cuda")
+            whisper_model.to(current_device)
         # Process with Whisper
         result = whisper_model.transcribe(temp_file, fp16=torch.cuda.is_available())
         transcript = result["text"].strip()
+        # Move whisper model back to CPU if in ZeroGPU mode
+        if IS_ZEROGPU and torch.cuda.is_available():
+            whisper_model.to(torch.device("cpu"))
         # Skip processing if transcript is empty
         if not transcript:
             # Clean up temp file if we created it
         error_msg = f"Error processing streaming audio: {str(e)}\n{traceback.format_exc()}"
         logger.error(error_msg)
+        # Make sure all models are on CPU if in ZeroGPU mode
+        if IS_ZEROGPU and torch.cuda.is_available():
+            try:
+                whisper_model.to(torch.device("cpu"))
+                profanity_model.to(torch.device("cpu"))
+                t5_model.to(torch.device("cpu"))
+                tts_model.to(torch.device("cpu"))
+                vocoder.to(torch.device("cpu"))
+            except:
+                pass
         # Update profanity info with error message
         stream_results["profanity_info"] = f"Error: {str(e)}"

requirements.txt CHANGED Viewed

@@ -7,4 +7,5 @@ torch
 transformers
 pillow
 sentencepiece
-spaces

 transformers
 pillow
 sentencepiece
+spaces
+accelerate

temp_tts_output_1742102180.wav ADDED Viewed

Binary file (217 kB). View file