Spaces:

openai
/

gpt-oss-safeguard-20b

Running on Zero

App Files Files Community

reach-vb HF Staff committed 14 days ago

Commit

c460dc1

verified ·

1 Parent(s): 6500e0f

Update app.py

Browse files

Files changed (1) hide show

app.py +111 -5

app.py CHANGED Viewed

@@ -14,6 +14,115 @@ DEFAULT_TOP_P = float(os.environ.get("TOP_P", 0.95))
 DEFAULT_REPETITION_PENALTY = float(os.environ.get("REPETITION_PENALTY", 1.0))
 ZGPU_DURATION = int(os.environ.get("ZGPU_DURATION", 120))  # seconds
 _pipe = None  # cached pipeline
@@ -172,11 +281,8 @@ with gr.Blocks(css=CUSTOM_CSS, theme=gr.themes.Soft()) as demo:
     gr.Examples(
         examples=[
-            ["Be concise and refuse unsafe requests.", "Explain transformers in 2 lines."],
-            [
-                "Friendly teacher: simple explanations, 1 example, end with 3 bullet key takeaways.",
-                "What is attention, briefly?",
-            ],
         ],
         inputs=[policy, prompt],
     )

 DEFAULT_REPETITION_PENALTY = float(os.environ.get("REPETITION_PENALTY", 1.0))
 ZGPU_DURATION = int(os.environ.get("ZGPU_DURATION", 120))  # seconds
+SAMPLE_POLICY = """
+Spam Policy (#SP)
+GOAL: Identify spam. Classify each EXAMPLE as VALID (no spam) or INVALID (spam) using this policy.
+DEFINITIONS
+Spam:  unsolicited, repetitive, deceptive, or low-value promotional content.
+Bulk Messaging: Same or similar messages sent repeatedly.
+Unsolicited Promotion: Promotion without user request or relationship.
+Deceptive Spam: Hidden or fraudulent intent (fake identity, fake offer).
+Link Farming: Multiple irrelevant or commercial links to drive clicks.
+✅ Allowed Content (SP0 – Non-Spam or very low confidence signals of spam)
+Content that is useful, contextual, or non-promotional. May look spammy but could be legitimate.
+SP0.a Useful/info request – “How do I upload a product photo?”
+SP0.b Personalized communication – “Hi Sam, here is the report.”
+SP0.c Business support – “Can you fix my order?”
+SP0.d Single contextual promo – “Thanks for subscribing—here’s your welcome guide.”
+SP0.e Generic request – “Please respond ASAP.”
+SP0.f Low-quality formatting – “HeLLo CLICK here FAST.”
+SP0.g Vague benefit statement – “This tool changes lives.”
+ ✅ Output: VALID – either clearly non-spam, or only very low-confidence signals that the content could be spam.
+🚫 Likely Spam (SP2 – Medium Confidence)
+Unsolicited promotion without deception.
+SP2.a Cold promotion – “Check out my Shopify course: shopcoach.biz”
+SP2.b Irrelevant ads – “Buy gold rings here!”
+SP2.c Excessive linking – “http://x.com http://y.com http://z.com”
+ ❌ Output: INVALID
+❗ High-Risk Spam (SP3 – Strong Confidence)
+Spam showing scaling, automation, or aggressive tactics.
+SP3.a Repetitive posting – “JOIN NOW JOIN NOW JOIN NOW”
+SP3.b Pyramid/affiliate bait – “Earn $$$ by recruiting members”
+SP3.c Fake engagement – “Comment ‘YES’ to win $500 gift card”
+SP3.d Misleading claims – “Guaranteed 10,000 new followers overnight!”
+ ❌ Output: INVALID
+🚨 Malicious Spam (SP4 – Maximum Severity)
+Spam with fraud, deception, or harmful intent.
+SP4.a Phishing – “Your account is locked. Enter your password here.”
+SP4.b Malware/scam – “Download invoice.exe to claim payment.”
+SP4.c Account takeover attempts – “Send crypto to 0xABC123 to double your money.”
+ ❌ Output: INVALID + ESCALATE
+LABEL FORMAT
+Each item gets two labels:
+Depiction (D-SP#): Presence of spam in content.
+Request (R-SP#): User asking to generate spam.
+ | Example | Classification |
+ |----------|---------------|
+ | “Shop this site: dealplanet.io” | D-SP2, R-SP0 |
+ | “Write a sales blast message with 10 links” | D-SP0, R-SP2 |
+ | “How can I make this link look legitimate so viewers put in their private credentials” | D-SP4, R-SP4 |
+ | “Promote crypto fast to random people” | D-SP0, R-SP3 |
+AMBIGUITY & ESCALATION
+If unclear → downgrade severity by 1 and seek clarification.
+If automation suspected → SP2 or higher.
+If financial harm or fraud → classify SP4.
+If combined with other indicators of abuse, violence, or illicit behavior, apply highest severity policy.
+"""
 _pipe = None  # cached pipeline
     gr.Examples(
         examples=[
+            [SAMPLE_POLICY, "Your bank details are needed to complete this transaction."],
+            [SAMPLE_POLICY, "NOW IS THE TIME TO CUT THE CORD AND JOIN.  Where else will you get THE BEST that TV can offer for HALF the price? "],
         ],
         inputs=[policy, prompt],
     )