Spaces:

Rathapoom
/

thai-ner-demo

Runtime error

App Files Files Community

Rathapoom committed on Sep 30

Commit

f5a7ece

verified ·

1 Parent(s): e1c1c45

Update app.py

Browse files

Files changed (1) hide show

app.py +26 -28

app.py CHANGED Viewed

@@ -6,13 +6,12 @@ import os
 # 1. โหลดโมเดล NER (เหมือนเดิม)
 print("กำลังโหลดโมเดล...")
-# ตรวจสอบว่ามี HF_TOKEN ใน Secrets หรือไม่
 hf_token = os.getenv("HF_TOKEN")
 ner_pipeline = pipeline(
     "token-classification",
     model="loolootech/no-name-ner-th",
     device=-1,
-    token=hf_token # ส่ง Token ไปด้วยตอนโหลดโมเดล
 )
 print("โมเดลพร้อมใช้งานแล้ว")
@@ -39,10 +38,10 @@ def merge_entities(ner_results):
     return merged_entities
-# 3. ฟังก์ชันหลักสำหรับ De-identification ของข้อความ 1 บรรทัด (เหมือนเดิม)
 def deidentify_single_text(text):
     if pd.isna(text) or not isinstance(text, str) or not text.strip():
-        return "" # คืนค่าเป็นสตริงว่างถ้าข้อมูลเป็นค่าว่าง, ไม่ใช่ข้อความ, หรือเป็นช่องว่าง
     ner_results = ner_pipeline(text)
     merged = merge_entities(ner_results)
@@ -55,12 +54,10 @@ def deidentify_single_text(text):
     return redacted_text
-# 4. [ใหม่] ฟังก์ชันสำหรับประมวลผลไฟล์ที่อัปโหลด
-def process_file(uploaded_file, column_name, progress=gr.Progress(track_tqdm=True)):
     if uploaded_file is None:
         raise gr.Error("กรุณาอัปโหลดไฟล์ก่อน")
-    if not column_name:
-        raise gr.Error("กรุณาระบุ 'ชื่อคอลัมน์' ที่ต้องการตรวจสอบ")
     file_path = uploaded_file.name
@@ -75,38 +72,39 @@ def process_file(uploaded_file, column_name, progress=gr.Progress(track_tqdm=Tru
     except Exception as e:
         raise gr.Error(f"ไม่สามารถอ่านไฟล์ได้: {e}")
-    # ตรวจสอบว่าชื่อคอลัมน์มีอยู่จริงในไฟล์หรือไม่
-    if column_name not in df.columns:
-        raise gr.Error(f"ไม่พบคอลัมน์ '{column_name}' ในไฟล์ของคุณ คอลัมน์ที่มีคือ: {list(df.columns)}")
-    # สร้างชื่อคอลัมน์ใหม่สำหรับผลลัพธ์
-    output_column_name = f"redacted_{column_name}"
-    # ประมวลผลข้อมูลในคอลัมน์ที่เลือก และแสดง progress bar
-    # ใช้ .astype(str) เพื่อแปลงข้อมูลทุกอย่างเป็นข้อความก่อนประมวลผล ป้องกัน error
-    df[output_column_name] = df[column_name].astype(str).progress_apply(deidentify_single_text)
     # สร้างไฟล์ผลลัพธ์เพื่อให้ผู้ใช้ดาวน์โหลด
-    output_filepath = "processed_output.csv"
-    # ใช้ encoding 'utf-8-sig' เพื่อให้เปิดใน Excel ภาษาไทยไม่เพี้ยน
-    df.to_csv(output_filepath, index=False, encoding='utf-8-sig')
-    return df, output_filepath
-# 5. [ใหม่] สร้างหน้าเว็บ Gradio สำหรับอัปโหลดไฟล์
 iface = gr.Interface(
-    fn=process_file,
     inputs=[
-        gr.File(label="อัปโหลดไฟล์ CSV หรือ Excel", file_types=[".csv", ".xlsx", ".xls"]),
-        gr.Textbox(label="ชื่อคอลัมน์ที่ต้องการตรวจสอบ (Column Name)", placeholder="เช่น: note, detail, description")
     ],
     outputs=[
-        gr.DataFrame(label="ตารางผลลัพธ์ (Output Table Preview)", wrap=True),
         gr.File(label="ดาวน์โหลดผลลัพธ์ (Download Result as CSV)")
     ],
-    title="📁 Bulk De-identification for CSV/Excel",
-    description="อัปโหลดไฟล์ตาราง (CSV, Excel) ระบุชื่อคอลัมน์ที่มีข้อความที่ต้องการปกปิดข้อมูลส่วนบุคคล แล้วระบบจะประมวลผลและสร้างไฟล์ใหม่ให้ดาวน์โหลด",
     allow_flagging="never"
 )

 # 1. โหลดโมเดล NER (เหมือนเดิม)
 print("กำลังโหลดโมเดล...")
 hf_token = os.getenv("HF_TOKEN")
 ner_pipeline = pipeline(
     "token-classification",
     model="loolootech/no-name-ner-th",
     device=-1,
+    token=hf_token
 )
 print("โมเดลพร้อมใช้งานแล้ว")
     return merged_entities
+# 3. ฟังก์ชันสำหรับ De-identification ของข้อความ 1 บรรทัด (เหมือนเดิม)
 def deidentify_single_text(text):
     if pd.isna(text) or not isinstance(text, str) or not text.strip():
+        return ""
     ner_results = ner_pipeline(text)
     merged = merge_entities(ner_results)
     return redacted_text
+# 4. [อัปเดต] ฟังก์ชันสำหรับประมวลผลไฟล์ (ไม่ต้องรับชื่อคอลัมน์แล้ว)
+def process_entire_file(uploaded_file, progress=gr.Progress(track_tqdm=True)):
     if uploaded_file is None:
         raise gr.Error("กรุณาอัปโหลดไฟล์ก่อน")
     file_path = uploaded_file.name
     except Exception as e:
         raise gr.Error(f"ไม่สามารถอ่านไฟล์ได้: {e}")
+    # สร้าง DataFrame ใหม่สำหรับเก็บผลลัพธ์
+    df_redacted = df.copy()
+    # [Key Change] ค้นหาคอลัมน์ทั้งหมดที่มีข้อมูลเป็นประเภทข้อความ (object)
+    text_columns = df.select_dtypes(include=['object']).columns
+    if len(text_columns) == 0:
+        raise gr.Error("ไม่พบคอลัมน์ที่เป็นข้อมูลประเภทข้อความ (text) ในไฟล์นี้เลย")
+    # วนลูปและประมวลผลทุกคอลัมน์ที่หาเจอ
+    print(f"กำลังประมวลผลคอลัมน์: {list(text_columns)}")
+    for col_name in progress.tqdm(text_columns, desc="Processing text columns"):
+        df_redacted[col_name] = df[col_name].astype(str).apply(deidentify_single_text)
     # สร้างไฟล์ผลลัพธ์เพื่อให้ผู้ใช้ดาวน์โหลด
+    output_filepath = "processed_output_full.csv"
+    df_redacted.to_csv(output_filepath, index=False, encoding='utf-8-sig')
+    return df_redacted, output_filepath
+# 5. [อัปเดต] สร้างหน้าเว็บ Gradio (ตัดช่องใส่ชื่อคอลัมน์ออก)
 iface = gr.Interface(
+    fn=process_entire_file,
     inputs=[
+        gr.File(label="อัปโหลดไฟล์ CSV หรือ Excel ที่ต้องการตรวจสอบทั้งตาราง", file_types=[".csv", ".xlsx", ".xls"])
     ],
     outputs=[
+        gr.DataFrame(label="ตารางผลลัพธ์ (Output Table Preview)", wrap=True, max_rows=10),
         gr.File(label="ดาวน์โหลดผลลัพธ์ (Download Result as CSV)")
     ],
+    title="📁 Automatic Table De-identification",
+    description="อัปโหลดไฟล์ตาราง (CSV, Excel) แล้วระบบจะค้นหาคอลัมน์ที่เป็น 'ข้อความ' ทั้งหมดโดยอัตโนมัติ และทำการปกปิดข้อมูลส่วนบุคคลให้ทันที",
     allow_flagging="never"
 )