wangdalin 10 months ago
parent
commit
bed0e4d284
4 changed files with 142 additions and 29 deletions
  1. +3 -3    .gitignore
  2. +23 -6   client.py
  3. +3 -1    config.py
  4. +113 -19 name_classify_api.py

+ 3 - 3
.gitignore

@@ -1,3 +1,3 @@
-data_before
-data_out
-process
+process
+__pycache__
+bert-train

+ 23 - 6
client.py

@@ -1,6 +1,6 @@
 import requests, os

-def cls_process(path, client_id, one_key, name_column, api_key="sk-iREtaVNjamaBArOTlc_2BfGFJVPiU-9EjSFMUspIPBT3BlbkFJxS0SMmKZD9L9UumPczee4VKawCwVeGBQAr9MgsWGkA",
+def cls_process_openai(path, client_id, one_key, name_column, api_key="sk-iREtaVNjamaBArOTlc_2BfGFJVPiU-9EjSFMUspIPBT3BlbkFJxS0SMmKZD9L9UumPczee4VKawCwVeGBQAr9MgsWGkA",
                proxy=False, chunk_size=100):
     # Define the request payload
     data = {
@@ -13,7 +13,7 @@ def cls_process(path, client_id, one_key, name_column, api_key="sk-iREtaVNjamaBA
         "chunk_size":chunk_size
         "chunk_size":chunk_size
     }
     }
     # 发送 POST 请求
     # 发送 POST 请求
-    response = requests.post("http://10.41.3.69:8070/classify/", json=data)
+    response = requests.post("http://172.28.5.44:8070/classify_openai/", json=data)
 
 
     # 处理响应
     # 处理响应
     if response.status_code == 200:
     if response.status_code == 200:
@@ -23,7 +23,23 @@ def cls_process(path, client_id, one_key, name_column, api_key="sk-iREtaVNjamaBA
         print(f"请求失败,状态码: {response.status_code}")
         print(f"请求失败,状态码: {response.status_code}")
         print("错误信息:", response.text)
         print("错误信息:", response.text)
 
 
+def cls_process_bert(path, client_id, name_column):
+    # Define the request payload
+    data = {
+        "path": path,
+        "client_id": client_id,
+        "name_column": name_column
+    }
+    # Send the POST request
+    response = requests.post("http://172.28.5.44:8070/classify_bert/", json=data)
+
+    # Handle the response
+    if response.status_code == 200:
+        result = response.json()
+        print("Response:", result)
+    else:
+        print(f"Request failed, status code: {response.status_code}")
+        print("Error message:", response.text)
 def upload_process(file_path, client_id):
     # Form data
     data = {
@@ -36,7 +52,7 @@ def upload_process(file_path, client_id):
         files = {
             'file': (name, file, 'text/plain')  # 'text/plain' is the MIME type; adjust as needed
         }
-        response = requests.post("http://10.41.3.69:8070/uploadfile/", data=data, files=files)
+        response = requests.post("http://ip:port/uploadfile/", data=data, files=files)

     # Handle the response
     if response.status_code == 200:
@@ -51,10 +67,11 @@ def upload_process(file_path, client_id):
 def both_process(path, client_id, one_key, name_column):
     file_path = upload_process(file_path=path, client_id=client_id)
     if file_path:
-        cls_process(path=file_path, client_id=client_id, one_key=one_key, name_column=name_column)
+        cls_process_openai(path=file_path, client_id=client_id, one_key=one_key, name_column=name_column)

 if __name__ == "__main__":
-    both_process(path="E:/code/name_classify/data_before/人群分类-自建站2024.08.xlsx", client_id='wangdalin', one_key='网店单号', name_column='收货姓名')
+    # both_process(path="E:/code/name_classify/data_before/人群分类-自建站2024.08.xlsx", client_id='wangdalin', one_key='网店单号', name_column='收货姓名')
     # upload_process(file_path="E:/code/name_classify/data_before/人群分类-自建站2024.08.xlsx", client_id='wangdalin')
-    # cls_process(path="./process/wangdalin\\人群分类-自建站2024.08.xlsx",client_id='wangdalin', one_key='网店单号', name_column='收货姓名')
+    # cls_process_openai(path="./process/wangdalin\\人群分类-自建站2024.08.xlsx",client_id='wangdalin', one_key='网店单号', name_column='收货姓名')
+    cls_process_bert(path="./process/wangdalin/人群分类-自建站2024.08.xlsx",client_id='wangdalin', name_column='收货姓名')


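Note: a minimal sketch of consuming the new endpoint end to end (assumptions: the server address 172.28.5.44:8070 used above, and the "output_file" URL in the JSON response of /classify_bert/ below, which name_classify_api.py serves from its static /data mount):

    import requests

    resp = requests.post("http://172.28.5.44:8070/classify_bert/",
                         json={"path": "./process/wangdalin/人群分类-自建站2024.08.xlsx",
                               "client_id": "wangdalin", "name_column": "收货姓名"})
    resp.raise_for_status()
    result = resp.json()
    # "output_file" points at the classified workbook under the /data static mount
    with open("classified.xlsx", "wb") as f:
        f.write(requests.get(result["output_file"]).content)
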
+ 3 - 1
config.py

@@ -20,4 +20,6 @@ user_prompt = """提供的数据:{chunk}
                 返回的数据:"""
 port = 8070
 file_base_url = f'http://{get_ip_address()}:{port}/data'
-
+model_save_path = '/mnt/e/code/name_classify/bert/model/bert-name-classify/best_model_2024_9_4.pkl'
+pre_train_model = '/mnt/e/code/name_classify/bert/model/bert-name-classify'
+label_revert_map = {0:'亚裔华人', 1:'亚裔非华人', 2:'非亚裔'}
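
Note: these three settings drive the new BERT path in name_classify_api.py below: pre_train_model is the directory holding the tokenizer and BertConfig, model_save_path is the fine-tuned state dict, and label_revert_map turns an argmax index back into a label ('亚裔华人' = Asian Chinese, '亚裔非华人' = Asian non-Chinese, '非亚裔' = non-Asian). A minimal sketch of the lookup, with a hypothetical logits tensor:

    import torch

    label_revert_map = {0: '亚裔华人', 1: '亚裔非华人', 2: '非亚裔'}  # as defined above
    logits = torch.tensor([[0.1, 2.3, 0.4]])  # hypothetical model output for one name
    label = label_revert_map[torch.argmax(logits, dim=1).item()]  # index 1 -> '亚裔非华人'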

+ 113 - 19
name_classify_api.py

@@ -1,5 +1,5 @@
 import pandas as pd 
-import os, time, shutil
+import os, time, shutil, sys
 import openai
 from fastapi import FastAPI, UploadFile, File, Form
 from pydantic import BaseModel
@@ -9,7 +9,14 @@ import uvicorn, socket
 from tqdm import tqdm
 from fastapi.staticfiles import StaticFiles
 from config import *
+
 from functions import split_dataframe_to_dict, extract_list_from_string
+sys.path.append(os.path.join(os.path.dirname(__file__), 'bert'))
+import torch
+# from model import BertClassifier
+from bert.model import BertClassifier
+from transformers import BertTokenizer, BertConfig
+
 app = FastAPI()
 app.mount("/data", StaticFiles(directory='./process'), name="static")
 class ClassificationRequest(BaseModel):
@@ -21,6 +28,90 @@ class ClassificationRequest(BaseModel):
     proxy: bool = False
     chunk_size: int = 100

+class ClassificationRequestBert(BaseModel):
+    path: str
+    client_id: str
+    name_column: str
+
+bert_config = BertConfig.from_pretrained(pre_train_model)
+# Define the model
+model = BertClassifier(bert_config, len(label_revert_map.keys()))
+# Load the trained model
+model.load_state_dict(torch.load(model_save_path, map_location=torch.device('cpu')))
+model.eval()
+tokenizer = BertTokenizer.from_pretrained(pre_train_model)
+
+def bert_predict(text):
+    token = tokenizer(text, add_special_tokens=True, padding='max_length', truncation=True, max_length=512)
+    input_ids = token['input_ids']
+    attention_mask = token['attention_mask']
+    token_type_ids = token['token_type_ids']
+
+    input_ids = torch.tensor([input_ids], dtype=torch.long)
+    attention_mask = torch.tensor([attention_mask], dtype=torch.long)
+    token_type_ids = torch.tensor([token_type_ids], dtype=torch.long)
+
+    predicted = model(
+        input_ids,
+        attention_mask,
+        token_type_ids,
+    )
+    pred_label = torch.argmax(predicted, dim=1).numpy()[0]
+    
+    return label_revert_map[pred_label]
+def predict_excel(file_path, name_col, temp_path, save_path, chunksize=5):
+    # Initialize variables
+    error_file_name, error_file_extension = os.path.splitext(os.path.basename(save_path))
+    # Add the suffix
+    error_file = error_file_name + '_error' + error_file_extension
+    origin_csv_file = error_file_name + '_origin.csv'
+    # Generate the new file paths
+    error_file_path = os.path.join(os.path.dirname(save_path), error_file)
+    origin_csv_path = os.path.join(os.path.dirname(save_path), origin_csv_file)
+    total_processed = 0
+    df_origin = pd.read_excel(file_path)
+    df_error = pd.DataFrame(columns=df_origin.columns)
+    df_origin.to_csv(origin_csv_path, index=False)
+    # Read the CSV file in chunks
+    
+    for chunk in tqdm(pd.read_csv(origin_csv_path, chunksize=chunksize, iterator=True), desc='Processing', unit='item'):
+        try:
+            # Process each chunk
+            chunk['classify'] = chunk[name_col].apply(bert_predict)
+            # Incrementally save the processed results
+            if total_processed == 0:
+                chunk.to_csv(temp_path, mode='w', index=False)
+            else:
+                chunk.to_csv(temp_path, mode='a', header=False, index=False)
+            # Update the count of processed rows
+            total_processed += len(chunk)
+        except Exception as e:
+            df_error = pd.concat([df_error, chunk])
+    df_final = pd.read_csv(temp_path)
+    df_final.to_excel(save_path)
+    os.remove(origin_csv_path)
+    if len(df_error) == 0:
+        return save_path, '', 0
+    else:
+        df_error.to_excel(error_file_path)
+        return save_path, error_file_path, len(df_error)
+def remove_files():
+    current_time = time.time()
+    TIME_THRESHOLD_FILEPATH = 30 * 24 * 60 * 60
+    TIME_THRESHOLD_FILE = 10 * 24 * 60 * 60
+    for root, dirs, files in os.walk(basic_path, topdown=False):
+        # Delete files
+        for file in files:
+            file_path = os.path.join(root, file)
+            if current_time - os.path.getmtime(file_path) > TIME_THRESHOLD_FILE:
+                print(f"Deleting file: {file_path}")
+                os.remove(file_path)
+        # Delete directories
+        for dir in dirs:
+            dir_path = os.path.join(root, dir)
+            if current_time - os.path.getmtime(dir_path) > TIME_THRESHOLD_FILEPATH:
+                print(f"Deleting directory: {dir_path}")
+                shutil.rmtree(dir_path)
 @app.post("/uploadfile/")
 @app.post("/uploadfile/")
 async def create_upload_file(file: UploadFile = File(...), client_id: str = Form(...)):
 async def create_upload_file(file: UploadFile = File(...), client_id: str = Form(...)):
     user_directory = f'{basic_path}/{client_id}'
     user_directory = f'{basic_path}/{client_id}'
@@ -40,26 +131,10 @@ async def create_upload_file(file: UploadFile = File(...), client_id: str = Form
     except Exception as e:
         return JSONResponse(content={"message": f"An error occurred: {str(e)}"}, status_code=500)
     
-@app.post("/classify/")
+@app.post("/classify_openai/")
 async def classify_data(request: ClassificationRequest):
     try:
-        current_time = time.time()
-        TIME_THRESHOLD_FILEPATH = 30 * 24 * 60 * 60
-        TIME_THRESHOLD_FILE = 10 * 24 * 60 * 60
-        for root, dirs, files in os.walk(basic_path, topdown=False):
-            # Delete files
-            for file in files:
-                file_path = os.path.join(root, file)
-                if current_time - os.path.getmtime(file_path) > TIME_THRESHOLD_FILE:
-                    print(f"Deleting file: {file_path}")
-                    os.remove(file_path)
-            # Delete directories
-            for dir in dirs:
-                dir_path = os.path.join(root, dir)
-                if current_time - os.path.getmtime(dir_path) > TIME_THRESHOLD_FILEPATH:
-                    print(f"Deleting directory: {dir_path}")
-                    shutil.rmtree(dir_path)
-        
+        remove_files()
         work_path = f'{basic_path}/{request.client_id}'
         if not os.path.exists(work_path):
             os.makedirs(work_path, exist_ok=True)
@@ -111,5 +186,24 @@ async def classify_data(request: ClassificationRequest):
     except Exception as e:
         return {"message": f"An error occurred during processing: {e}"}

+@app.post("/classify_bert/")
+async def classify_data(request: ClassificationRequestBert):
+    remove_files()
+    work_path = f'{basic_path}/{request.client_id}'
+    if not os.path.exists(work_path):
+        os.makedirs(work_path, exist_ok=True)
+    final_file_name, final_file_extension = os.path.splitext(os.path.basename(request.path))
+    # 添加后缀
+    final_file = final_file_name + '_classify' + final_file_extension
+
+    # 生成新的文件路径
+    new_file_path = os.path.join(os.path.dirname(request.path), final_file)
+    timestamp_str = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
+    temp_csv = work_path + '/' + timestamp_str + 'output_temp.csv'
+    save_path, error_path, error_len = predict_excel(request.path, request.name_column, temp_path=temp_csv ,save_path=new_file_path)
+    if error_len == 0:
+        return {"message": "分类完成", "output_file": file_base_url + save_path.split(basic_path)[1]}
+    else:
+        return {"message": "分类完成只完成部分", "output_file": file_base_url + save_path.split(basic_path)[1], "output_file_nonprocess":file_base_url + save_path.split(error_path)[1],}
 if __name__ == "__main__":
     uvicorn.run(app, host="0.0.0.0", port=port)
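
Note: a minimal local smoke test of the new BERT path (assumptions: the paths in config.py exist on disk and bert/model.py defines the BertClassifier imported above; importing the module loads the model once at import time, and uvicorn only starts under __main__):

    # Start the API:  python name_classify_api.py   (serves on 0.0.0.0:8070)
    # Or sanity-check the classifier directly, without starting the server:
    from name_classify_api import bert_predict

    print(bert_predict("王小明"))  # expected: one of '亚裔华人' / '亚裔非华人' / '非亚裔'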