Spaces:

Rathapoom
/

thai-ner-demo

Runtime error

App Files Files Community

Rathapoom committed on Sep 30

Commit

e1c1c45

verified ·

1 Parent(s): 6b4b766

Update app.py

Browse files

Files changed (1) hide show

app.py +74 -60

app.py CHANGED Viewed

@@ -1,98 +1,112 @@
 import gradio as gr
 from transformers import pipeline
 import re
-# 1. โหลดโมเดล NER จาก Hugging Face
-# device=-1 หมายถึงให้ใช้ CPU ซึ่งเหมาะกับ Free tier ของ HF Spaces
 print("กำลังโหลดโมเดล...")
-ner_pipeline = pipeline("token-classification", model="loolootech/no-name-ner-th", device=-1)
 print("โมเดลพร้อมใช้งานแล้ว")
-# 2. ฟังก์ชันสำหรับรวม Token ที่อยู่ติดกัน (B-PERSON, I-PERSON -> PERSON)
 def merge_entities(ner_results):
     merged_entities = []
     current_entity = None
     for entity in ner_results:
-        # ลบ B- หรือ I- prefix ออกไปเพื่อให้ได้ประเภท entity ที่แท้จริง
         entity_type = re.sub(r'^[BI]-', '', entity['entity'])
         if current_entity and entity['start'] == current_entity['end'] and entity_type == current_entity['type']:
-            # ถ้า token นี้อยู่ติดกับ entity ก่อนหน้าและเป็นประเภทเดียวกัน ให้รวมกัน
             current_entity['word'] += entity['word']
             current_entity['end'] = entity['end']
             current_entity['score'] = max(current_entity['score'], entity['score'])
         else:
-            # ถ้าไม่ใช่ ให้เริ่มนับเป็น entity ใหม่
             if current_entity:
                 merged_entities.append(current_entity)
             current_entity = {
-                'type': entity_type,
-                'word': entity['word'],
-                'start': entity['start'],
-                'end': entity['end'],
-                'score': entity['score']
             }
-    # เพิ่ม entity สุดท้ายที่ค้างไว้
     if current_entity:
         merged_entities.append(current_entity)
     return merged_entities
-# 3. ฟังก์ชันหลักสำหรับ De-identification
-def deidentify_text(text):
-    if not text.strip():
-        return "กรุณาใส่ข้อความ", ""
-    # รัน NER pipeline
-    ner_results = ner_pipeline(text)
-    # รวม token ที่อยู่ติดกัน
     merged = merge_entities(ner_results)
-    # สร้างข้อความที่ไฮไลท์ entity ต่างๆ เพื่อให้เห็นภาพ
-    highlighted_text = ""
-    last_index = 0
-    for entity in merged:
-        start, end, label, word = entity['start'], entity['end'], entity['type'], entity['word']
-        # เพิ่มส่วนของข้อความที่ไม่ได้ถูกระบุว่าเป็น entity
-        highlighted_text += text[last_index:start]
-        # เพิ่มส่วนของ entity ที่ไฮไลท์
-        highlighted_text += f" <mark>{word}**[{label}]**</mark> "
-        last_index = end
-    # เพิ่มข้อความส่วนที่เหลือ
-    highlighted_text += text[last_index:]
-    # ทำการแทนที่ (Redaction) จากหลังมาหน้าเพื่อไม่ให้ index เพี้ยน
     redacted_text = text
     for entity in reversed(merged):
         start, end, label = entity['start'], entity['end'], entity['type']
         redacted_text = redacted_text[:start] + f"[{label}]" + redacted_text[end:]
-    return redacted_text, highlighted_text
-# 4. สร้างหน้าเว็บด้วย Gradio
 iface = gr.Interface(
-    fn=deidentify_text,
-    inputs=gr.Textbox(
-        lines=5,
-        label="ข้อความที่ต้องการตรวจสอบ (Input Text)",
-        placeholder="เช่น: คุณสมชายเป็นอะไรมาครับวันนี้ อ๋อวันนี้ปวดตับครับ งั้นวันนี้หมอขอตรวจละเอียดหน่อยนะ ได้เลยครับน้องมาร์ค"
-    ),
-    outputs=[
-        gr.Textbox(label="ข้อความที่ปกปิดข้อมูลแล้ว (Redacted Text)"),
-        gr.Markdown(label="ผลลัพธ์พร้อมไฮไลท์ (Highlighted Entities)")
     ],
-    title="👩‍⚕️ Thai Medical NER De-identification Demo",
-    description="ทดสอบโมเดล `loolootech/no-name-ner-th` สำหรับการตรวจจับและปกปิดข้อมูลส่วนบุคคลในข้อความภาษาไทยทางการแพทย์\n\n**Entity ที่รองรับ:** PERSON, PHONE, EMAIL, ADDRESS, DATE, NATIONAL_ID, HOSPITAL_IDS",
-    examples=[
-        ["คุณสมชายเป็นอะไรมาครับวันนี้ อ๋อวันนี้ปวดตับครับ งั้นวันนี้หมอขอตรวจละเอียดหน่อยนะ ได้เลยครับน้องมาร์ค"],
-        ["คนไข้ชื่อสมศรี มากี่โมง เบอร์โทร 081-234-5678 นัดตรวจวันที่ 15/10/2568"],
-        ["ส่งผลตรวจไปที่ [email protected] ด้วยครับ เลขบัตรประชาชนคือ 1234567890123"]
     ],
     allow_flagging="never"
 )

 import gradio as gr
+import pandas as pd
 from transformers import pipeline
 import re
+import os
# 1. Load the NER model once at import time so every request reuses it.
print("กำลังโหลดโมเดล...")

# Optional Hugging Face access token, read from the environment (for example
# a Space secret). It is None when the variable is not set.
hf_token = os.environ.get("HF_TOKEN")

# Token-classification pipeline for the Thai NER checkpoint.
# device=-1 selects the CPU.
ner_pipeline = pipeline(
    task="token-classification",
    model="loolootech/no-name-ner-th",
    device=-1,
    token=hf_token,
)

print("โมเดลพร้อมใช้งานแล้ว")
# 2. Merge adjacent tokens of the same type (B-PERSON, I-PERSON -> PERSON).
def merge_entities(ner_results):
    """Collapse touching token-level predictions into whole-entity spans.

    Args:
        ner_results: Iterable of dicts with the keys 'entity', 'word',
            'start', 'end' and 'score'.

    Returns:
        A list of dicts with the keys 'type', 'word', 'start', 'end' and
        'score', in input order. A token is folded into the previous span
        when it starts exactly where that span ends and has the same type
        once a leading 'B-'/'I-' prefix is removed. A merged span keeps the
        highest score among its tokens.
    """
    spans = []
    for token in ner_results:
        # Drop the B-/I- prefix to get the bare entity type.
        label = re.sub(r'^[BI]-', '', token['entity'])
        previous = spans[-1] if spans else None

        extends_previous = (
            previous is not None
            and token['start'] == previous['end']
            and label == previous['type']
        )
        if not extends_previous:
            # Gap or type change: start a new span.
            spans.append({
                'type': label,
                'word': token['word'],
                'start': token['start'],
                'end': token['end'],
                'score': token['score'],
            })
            continue

        # Touching token of the same type: grow the open span in place.
        previous['word'] += token['word']
        previous['end'] = token['end']
        previous['score'] = max(previous['score'], token['score'])

    return spans
# 3. De-identify a single piece of text (one cell / one line).
def deidentify_single_text(text):
    """Return `text` with every detected entity replaced by a `[TYPE]` tag.

    Relies on the module-level `ner_pipeline` and `merge_entities`.

    Args:
        text: The value to redact. Anything that is not a non-blank string
            (None, NaN, numbers, lists, whitespace-only strings, ...) is
            treated as "nothing to redact".

    Returns:
        The redacted string, or "" when `text` is not a non-blank string.
    """
    # Check the type first. Every missing-value marker (None, NaN, pd.NA) is
    # a non-string, so this one test covers them. It also avoids calling
    # pd.isna() on list-like input, which returns an array whose truth value
    # is ambiguous and would raise ValueError.
    if not isinstance(text, str) or not text.strip():
        return ""

    ner_results = ner_pipeline(text)
    merged = merge_entities(ner_results)

    # Replace from the last entity to the first so the character offsets of
    # the entities still to be replaced are not shifted.
    redacted_text = text
    for entity in reversed(merged):
        start, end, label = entity['start'], entity['end'], entity['type']
        redacted_text = redacted_text[:start] + f"[{label}]" + redacted_text[end:]
    return redacted_text
# 4. [New] Process an uploaded CSV/Excel file.
def _read_table(file_path):
    """Load `file_path` into a DataFrame, picking the reader by extension.

    Raises:
        gr.Error: If the extension is not .csv/.xlsx/.xls, or if pandas
            fails to parse the file.
    """
    # Compare case-insensitively so "DATA.CSV" is accepted too.
    lowered = file_path.lower()
    if lowered.endswith('.csv'):
        reader = pd.read_csv
    elif lowered.endswith(('.xlsx', '.xls')):
        reader = pd.read_excel
    else:
        # Raised outside the try block below so that it is not re-wrapped
        # as a "cannot read file" error.
        raise gr.Error("ไฟล์ไม่รองรับ กรุณาอัปโหลด .csv หรือ .xlsx เท่านั้น")

    try:
        return reader(file_path)
    except Exception as e:
        # UI boundary: report any parser failure to the user as a gr.Error.
        raise gr.Error(f"ไม่สามารถอ่านไฟล์ได้: {e}") from e


def process_file(uploaded_file, column_name, progress=gr.Progress(track_tqdm=True)):
    """Redact one column of an uploaded table and return the result.

    Args:
        uploaded_file: The upload from the Gradio File component. Either an
            object exposing the path as `.name` or a plain path string.
        column_name: Name of the column whose text should be redacted.
        progress: Gradio progress tracker used to report per-row progress.

    Returns:
        A `(DataFrame, path)` tuple: the input table with an extra
        `redacted_<column_name>` column, and the path of a CSV copy of it.

    Raises:
        gr.Error: If no file or column name is given, the file type is not
            supported, the file cannot be read, or the column is missing.
    """
    import tempfile

    if uploaded_file is None:
        raise gr.Error("กรุณาอัปโหลดไฟล์ก่อน")
    if not column_name:
        raise gr.Error("กรุณาระบุ 'ชื่อคอลัมน์' ที่ต้องการตรวจสอบ")

    # Accept both a file-like object carrying `.name` and a plain path string.
    file_path = getattr(uploaded_file, "name", uploaded_file)
    df = _read_table(file_path)

    # Make sure the requested column really exists in the file.
    if column_name not in df.columns:
        raise gr.Error(f"ไม่พบคอลัมน์ '{column_name}' ในไฟล์ของคุณ คอลัมน์ที่มีคือ: {list(df.columns)}")

    output_column_name = f"redacted_{column_name}"

    # Stringify real values but keep empty cells empty. A blanket
    # .astype(str) would turn NaN into the literal text "nan" and send it
    # through the model.
    texts = ["" if pd.isna(value) else str(value) for value in df[column_name]]

    # Iterate through the Gradio progress tracker rather than
    # Series.progress_apply, which only exists after tqdm.pandas() has been
    # registered.
    df[output_column_name] = [
        deidentify_single_text(text) for text in progress.tqdm(texts)
    ]

    # Write to a unique temporary file so concurrent requests never
    # overwrite (or read) each other's redacted output.
    with tempfile.NamedTemporaryFile(
        prefix="processed_output_", suffix=".csv", delete=False
    ) as handle:
        output_filepath = handle.name

    # 'utf-8-sig' adds a BOM so Excel displays Thai text correctly.
    df.to_csv(output_filepath, index=False, encoding='utf-8-sig')
    return df, output_filepath
# 5. [New] Gradio page for bulk file upload: a file and a column name go in,
# a preview table and a downloadable CSV come out.
iface = gr.Interface(
    fn=process_file,
    # Inputs, in the order process_file receives them.
    inputs=[
        gr.File(
            label="อัปโหลดไฟล์ CSV หรือ Excel",
            file_types=[".csv", ".xlsx", ".xls"],
        ),
        gr.Textbox(
            label="ชื่อคอลัมน์ที่ต้องการตรวจสอบ (Column Name)",
            placeholder="เช่น: note, detail, description",
        ),
    ],
    # Outputs, in the order process_file returns them.
    outputs=[
        gr.DataFrame(
            label="ตารางผลลัพธ์ (Output Table Preview)",
            wrap=True,
        ),
        gr.File(
            label="ดาวน์โหลดผลลัพธ์ (Download Result as CSV)",
        ),
    ],
    title="📁 Bulk De-identification for CSV/Excel",
    description="อัปโหลดไฟล์ตาราง (CSV, Excel) ระบุชื่อคอลัมน์ที่มีข้อความที่ต้องการปกปิดข้อมูลส่วนบุคคล แล้วระบบจะประมวลผลและสร้างไฟล์ใหม่ให้ดาวน์โหลด",
    allow_flagging="never",
)