
Add readme, extract config, tidy up the client

wangdalin committed 10 months ago
commit e627f79d74
6 changed files with 318 additions and 139 deletions
  1. client.py (+60 -48)
  2. config.py (+23 -0)
  3. functions.py (+74 -0)
  4. name_classify_api.py (+18 -91)
  5. readme.md (+131 -0)
  6. tools/md2word.py (+12 -0)

+ 60 - 48
client.py

@@ -1,48 +1,60 @@
-import requests
-
-# Build the request payload
-data = {
-    "path": "./process/wangdalin\\人群分类-自建站2024.08.xlsx",
-    "client_id": "wangdalin",
-    "one_key": "网店单号",
-    "name_column": "收货姓名",
-    "api_key": "sk-iREtaVNjamaBArOTlc_2BfGFJVPiU-9EjSFMUspIPBT3BlbkFJxS0SMmKZD9L9UumPczee4VKawCwVeGBQAr9MgsWGkA",
-    "proxy": False,  # 根据需要设置为 True 或 False
-    "chunk_size":100
-}
-
-# Send the POST request
-response = requests.post("http://10.41.3.69:8070/classify/", json=data)
-
-# Handle the response
-if response.status_code == 200:
-    result = response.json()
-    print("接口响应:", result)
-else:
-    print(f"请求失败,状态码: {response.status_code}")
-    print("错误信息:", response.text)
-
-
-################################
-# # File path
-# file_path = 'E:/code/name_classify/data_before/人群分类-自建站2024.08.xlsx'
-
-# # Form data
-# data = {
-#     'client_id': 'wangdalin'
-# }
-
-# # Send the POST request
-# with open(file_path, 'rb') as file:
-#     files = {
-#         'file': ('人群分类-自建站2024.08.xlsx', file, 'text/plain')  # 'text/plain' is the MIME type; change as needed
-#     }
-#     response = requests.post("http://10.41.3.69:8070/uploadfile/", data=data, files=files)
-
-# # Handle the response
-# if response.status_code == 200:
-#     result = response.json()
-#     print("接口响应:", result)
-# else:
-#     print(f"请求失败,状态码: {response.status_code}")
-#     print("错误信息:", response.text)
+import requests, os
+
+def cls_process(path, client_id, one_key, name_column, api_key="sk-iREtaVNjamaBArOTlc_2BfGFJVPiU-9EjSFMUspIPBT3BlbkFJxS0SMmKZD9L9UumPczee4VKawCwVeGBQAr9MgsWGkA",
+                proxy=False, chunk_size=100):
+    # Build the request payload
+    data = {
+        "path": path,
+        "client_id": client_id,
+        "one_key": one_key,
+        "name_column": name_column,
+        "api_key": api_key,
+        "proxy": proxy,  
+        "chunk_size":chunk_size
+    }
+    # Send the POST request
+    response = requests.post("http://10.41.3.69:8070/classify/", json=data)
+
+    # Handle the response
+    if response.status_code == 200:
+        result = response.json()
+        print("接口响应:", result)
+    else:
+        print(f"请求失败,状态码: {response.status_code}")
+        print("错误信息:", response.text)
+
+
+def upload_process(file_path, client_id):
+    # Form data
+    data = {
+        'client_id': client_id
+    }
+
+    # Send the POST request
+    with open(file_path, 'rb') as file:
+        name = os.path.basename(file_path)
+        files = {
+            'file': (name, file, 'text/plain')  # 'text/plain' is the MIME type; change as needed
+        }
+        response = requests.post("http://10.41.3.69:8070/uploadfile/", data=data, files=files)
+
+    # Handle the response
+    if response.status_code == 200:
+        result = response.json()
+        print("接口响应:", result)
+        return result.get('file_path')
+    else:
+        print(f"请求失败,状态码: {response.status_code}")
+        print("错误信息:", response.text)
+        return None
+
+def both_process(path, client_id, one_key, name_column):
+    file_path = upload_process(file_path=path, client_id=client_id)
+    if file_path:
+        cls_process(path=file_path, client_id=client_id, one_key=one_key, name_column=name_column)
+
+if __name__ == "__main__":
+    both_process(path="E:/code/name_classify/data_before/人群分类-自建站2024.08.xlsx", client_id='wangdalin', one_key='网店单号', name_column='收货姓名')
+    # upload_process(file_path="E:/code/name_classify/data_before/人群分类-自建站2024.08.xlsx", client_id='wangdalin')
+    # cls_process(path="./process/wangdalin\\人群分类-自建站2024.08.xlsx",client_id='wangdalin', one_key='网店单号', name_column='收货姓名')
+

+ 23 - 0
config.py

@@ -0,0 +1,23 @@
+import socket
+def get_ip_address():
+    try:
+        # Create a socket object
+        s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
+        # Connect to a public DNS server (Google DNS here)
+        s.connect(("8.8.8.8", 80))
+        # Get the local IP address
+        ip_address = s.getsockname()[0]
+        s.close()
+        return ip_address
+    except Exception as e:
+        return str(e)
+api_key = "sk-iREtaVNjamaBArOTlc_2BfGFJVPiU-9EjSFMUspIPBT3BlbkFJxS0SMmKZD9L9UumPczee4VKawCwVeGBQAr9MgsWGkA"
+basic_path = './process'
+openai_url = 'https://api.openai.com/v1'
+proxy_url = 'https://fast.bemore.lol/v1'
+cls_system_prompt = '你是一个名字判断专家,你需要根据提供的列表中的每一个字典元素的会员姓名,判断其名字分类,分别为3类: 亚裔华人,亚裔非华人, 非亚裔,并将结果填充到会员分类中, 整合之后返回与提供数据一样的格式给我'
+user_prompt = """提供的数据:{chunk}
+                返回的数据:"""
+port = 8070
+file_base_url = f'http://{get_ip_address()}:{port}/data'
+
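For context, a quick sketch (illustrative values, not from the repo) of how `basic_path` and `file_base_url` get combined in name_classify_api.py to turn a processed file's path into a download link. The app mounts `./process` as the static `/data` route, so anything under `basic_path` is directly downloadable:

```python
# Illustrative sketch; mirrors the URL construction in name_classify_api.py.
basic_path = './process'
file_base_url = 'http://10.41.3.69:8070/data'  # i.e. f'http://{get_ip_address()}:{port}/data'

new_file_path = './process/wangdalin/result.xlsx'
download_url = file_base_url + new_file_path.split(basic_path)[1]
print(download_url)  # http://10.41.3.69:8070/data/wangdalin/result.xlsx
```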

+ 74 - 0
functions.py

@@ -0,0 +1,74 @@
+import re, ast, requests
+import math, json
+def save_dict_to_json(dictionary, filename):
+    with open(filename, 'w', encoding='utf-8') as file:
+        json.dump(dictionary, file, ensure_ascii=False, indent=4)
+
+def load_dict_from_json(filename):
+    with open(filename, 'r', encoding='utf-8') as file:
+        return json.load(file)
+def split_dataframe_to_dict(df, chunk_size=100):
+    # How many chunks we need
+    num_chunks = math.ceil(len(df) / chunk_size)
+    
+    # Dict that collects the chunks
+    result_dict = {}
+    
+    for i in range(num_chunks):
+        # Slice the DataFrame
+        start = i * chunk_size
+        end = min((i + 1) * chunk_size, len(df))
+        chunk = df.iloc[start:end]
+        
+        # Convert the slice to records and store it
+        result_dict[f'chunk_{i+1}'] = chunk.to_dict(orient='records')
+    
+    return result_dict
+def extract_list_from_string(input_string):
+    # Find the list portion with a regex
+    list_pattern = r'\[.*?\]'
+    match = re.search(list_pattern, input_string, re.DOTALL)
+    
+    if match:
+        list_string = match.group()
+        try:
+            # Parse the string safely with ast.literal_eval
+            result = ast.literal_eval(list_string)
+            
+            # Make sure the result is a list
+            if isinstance(result, list):
+                return result
+            else:
+                print("解析结果不是列表")
+                return None
+        except Exception as e:
+            print(f"解析错误: {e}")
+            return None
+    else:
+        print("未找到列表结构")
+        return None
+def post_openai(messages):
+   Baseurl = "https://fast.bemore.lol"
+   Skey = "sk-dxl4rt2wWswbdrCr1c7b8500B68c43F5B6175b90F7D672C4"
+   payload = json.dumps({
+      "model": "gpt-4",
+      "messages": messages
+   })
+   url = Baseurl + "/v1/chat/completions"
+   headers = {
+      'Accept': 'application/json',
+      'Authorization': f'Bearer {Skey}',
+      'User-Agent': 'Apifox/1.0.0 (https://apifox.com)',
+      'Content-Type': 'application/json'
+   }
+
+   response = requests.request("POST", url, headers=headers, data=payload)
+
+   # Parse the JSON response into a dict
+   print(response)
+   data = response.json()
+
+   # Extract the content field
+   content = data['choices'][0]['message']['content']
+
+   return content
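A quick usage sketch for the helpers above (the sample rows and the model reply are made up, not from the repo):

```python
import pandas as pd
from functions import split_dataframe_to_dict, extract_list_from_string

df = pd.DataFrame({'会员姓名': ['张伟', 'John Smith'], '会员分类': ['', '']})
chunks = split_dataframe_to_dict(df, chunk_size=1)
# {'chunk_1': [{'会员姓名': '张伟', '会员分类': ''}],
#  'chunk_2': [{'会员姓名': 'John Smith', '会员分类': ''}]}

reply = "返回的数据:[{'会员姓名': '张伟', '会员分类': '亚裔华人'}]"
print(extract_list_from_string(reply))
# [{'会员姓名': '张伟', '会员分类': '亚裔华人'}]
```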

+ 18 - 91
name_classify_api.py

@@ -1,104 +1,33 @@
 import pandas as pd 
-import math, json, os, time, shutil
-import openai, re, ast, requests
+import os, time, shutil
+import openai
 from fastapi import FastAPI, UploadFile, File, Form
 from pydantic import BaseModel
 from fastapi.responses import JSONResponse
 from datetime import datetime
-import uvicorn
+import uvicorn, socket
 from tqdm import tqdm
+from fastapi.staticfiles import StaticFiles
+from config import *
+from functions import split_dataframe_to_dict, extract_list_from_string
 app = FastAPI()
+app.mount("/data", StaticFiles(directory='./process'), name="static")
 class ClassificationRequest(BaseModel):
     path: str
     client_id: str
     one_key: str
     name_column: str
-    api_key: str
-    proxy: bool
-    chunk_size: int
-def save_dict_to_json(dictionary, filename):
-    with open(filename, 'w', encoding='utf-8') as file:
-        json.dump(dictionary, file, ensure_ascii=False, indent=4)
-
-def load_dict_from_json(filename):
-    with open(filename, 'r', encoding='utf-8') as file:
-        return json.load(file)
-def split_dataframe_to_dict(df, chunk_size=100):
-    # How many chunks we need
-    num_chunks = math.ceil(len(df) / chunk_size)
-    
-    # Dict that collects the chunks
-    result_dict = {}
-    
-    for i in range(num_chunks):
-        # Slice the DataFrame
-        start = i * chunk_size
-        end = min((i + 1) * chunk_size, len(df))
-        chunk = df.iloc[start:end]
-        
-        # Convert the slice to records and store it
-        result_dict[f'chunk_{i+1}'] = chunk.to_dict(orient='records')
-    
-    return result_dict
-def extract_list_from_string(input_string):
-    # Find the list portion with a regex
-    list_pattern = r'\[.*?\]'
-    match = re.search(list_pattern, input_string, re.DOTALL)
-    
-    if match:
-        list_string = match.group()
-        try:
-            # Parse the string safely with ast.literal_eval
-            result = ast.literal_eval(list_string)
-            
-            # Make sure the result is a list
-            if isinstance(result, list):
-                return result
-            else:
-                print("解析结果不是列表")
-                return None
-        except Exception as e:
-            print(f"解析错误: {e}")
-            return None
-    else:
-        print("未找到列表结构")
-        return None
-def post_openai(messages):
-   Baseurl = "https://fast.bemore.lol"
-   Skey = "sk-dxl4rt2wWswbdrCr1c7b8500B68c43F5B6175b90F7D672C4"
-   payload = json.dumps({
-      "model": "gpt-4",
-      "messages": messages
-   })
-   url = Baseurl + "/v1/chat/completions"
-   headers = {
-      'Accept': 'application/json',
-      'Authorization': f'Bearer {Skey}',
-      'User-Agent': 'Apifox/1.0.0 (https://apifox.com)',
-      'Content-Type': 'application/json'
-   }
-
-   response = requests.request("POST", url, headers=headers, data=payload)
-
-   # Parse the JSON response into a dict
-   print(response)
-   data = response.json()
-
-   # Extract the content field
-   content = data['choices'][0]['message']['content']
-
-   return content
+    api_key: str = "sk-iREtaVNjamaBArOTlc_2BfGFJVPiU-9EjSFMUspIPBT3BlbkFJxS0SMmKZD9L9UumPczee4VKawCwVeGBQAr9MgsWGkA"
+    proxy: bool = False
+    chunk_size: int = 100
 
 @app.post("/uploadfile/")
 async def create_upload_file(file: UploadFile = File(...), client_id: str = Form(...)):
-    user_directory = f'./process/{client_id}'
+    user_directory = f'{basic_path}/{client_id}'
     if not os.path.exists(user_directory):
         os.makedirs(user_directory)
        os.chmod(user_directory, 0o777)  # set the user directory permissions to 777
-    print(user_directory)
-    print(file.filename)
     file_location = os.path.join(user_directory, file.filename)
-    print(file_location)
     try:
         with open(file_location, "wb+") as file_object:
             file_object.write(file.file.read())
@@ -117,7 +46,6 @@ async def classify_data(request: ClassificationRequest):
         current_time = time.time()
         TIME_THRESHOLD_FILEPATH = 30 * 24 * 60 * 60
         TIME_THRESHOLD_FILE = 10 * 24 * 60 * 60
-        basic_path = './process'
         for root, dirs, files in os.walk(basic_path, topdown=False):
            # Delete expired files
             for file in files:
@@ -131,8 +59,7 @@ async def classify_data(request: ClassificationRequest):
                 if current_time - os.path.getmtime(dir_path) > TIME_THRESHOLD_FILEPATH:
                     print(f"删除文件夹: {dir_path}")
                     shutil.rmtree(dir_path)
-        prompt = """提供的数据:{chunk}
-                返回的数据:"""
+        
         work_path = f'{basic_path}/{request.client_id}'
         if not os.path.exists(work_path):
             os.makedirs(work_path, exist_ok=True)
@@ -154,14 +81,14 @@ async def classify_data(request: ClassificationRequest):
 
         if not request.proxy:
             print(f'用户{request.client_id}正在使用直连的gpt-API')
-            client = openai.OpenAI(api_key=request.api_key, base_url='https://api.openai.com/v1')
+            client = openai.OpenAI(api_key=request.api_key, base_url=openai_url)
         else:
-            client = openai.OpenAI(api_key=request.api_key, base_url='https://fast.bemore.lol/v1')
+            client = openai.OpenAI(api_key=request.api_key, base_url=proxy_url)
         for name, value in tqdm(deal_result.items(), desc='Processing', unit='item'):
             try:
                 message = [
-                    {'role':'system', 'content': '你是一个名字判断专家,你需要根据提供的列表中的每一个字典元素的会员姓名,判断其名字分类,分别为3类: 亚裔华人,亚裔非华人, 非亚裔,并将结果填充到会员分类中, 整合之后返回与提供数据一样的格式给我'},
-                    {'role':'user', 'content':prompt.format(chunk=str(value))}
+                    {'role':'system', 'content': cls_system_prompt},
+                    {'role':'user', 'content':user_prompt.format(chunk=str(value))}
                 ]
                 # result_string = post_openai(message)
                 response = client.chat.completions.create(model='gpt-4',messages=message)
@@ -178,11 +105,11 @@ async def classify_data(request: ClassificationRequest):
             df_result = pd.read_csv(temp_csv)
             df_final = df_origin.merge(df_result, on='name', how='left').drop_duplicates(subset=[request.one_key,'name'], keep='first')
             df_final.to_excel(new_file_path)
-            return {"message": "分类完成", "output_file": new_file_path}
+            return {"message": "分类完成", "output_file": file_base_url + new_file_path.split(basic_path)[1]}
         else:
             return {"message": "文件没能处理成功"}
     except Exception as e:
         return {"message": f"处理出现错误: {e}"}
 
 if __name__ == "__main__":
-    uvicorn.run(app, host="0.0.0.0", port=8070)
+    uvicorn.run(app, host="0.0.0.0", port=port)

+ 131 - 0
readme.md

@@ -0,0 +1,131 @@
+The following is the interface documentation for these two endpoints, written in Markdown:
+
+---
+
+## API Documentation
+
+### 1. File Upload Endpoint
+
+**Path**: `/uploadfile/`
+
+**Method**: `POST`
+
+**Request Parameters**:
+
+| Parameter   | Type         | Required | Description                                  |
+| ----------- | ------------ | -------- | -------------------------------------------- |
+| `file`      | `UploadFile` | Yes      | The file to upload                           |
+| `client_id` | `string`     | Yes      | Client ID, used to create the user directory |
+
+**Description**:
+
+This endpoint uploads a file and stores it in a dedicated directory on the server; each client gets its own directory. After the upload, permissions are set on the file and its directory.
+
+**Example Request**:
+
+```bash
+curl -X POST "http://{server_address}/upload/" \
+  -F "file=@path/to/your/file.xlsx" \
+  -F "client_id=12345"
+```
+
+**Responses**:
+
+- Success:
+
+```json
+{
+  "message": "文件 'your_file.xlsx' 上传成功",
+  "client_id": "12345",
+  "file_path": "./process/12345/your_file.xlsx"
+}
+```
+
+- Error:
+
+```json
+{
+  "message": "发生错误: [错误信息]"
+}
+```
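+
+A minimal Python equivalent of the curl call above, mirroring `upload_process()` in client.py (the server address and file name are placeholders):
+
+```python
+import requests
+
+server_address = '10.41.3.69:8070'  # placeholder
+with open('path/to/your/file.xlsx', 'rb') as f:
+    response = requests.post(
+        f'http://{server_address}/uploadfile/',
+        data={'client_id': '12345'},
+        files={'file': ('file.xlsx', f, 'text/plain')},
+    )
+print(response.json())  # on success, 'file_path' is the value to pass to /classify/
+```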
+
+---
+
+### 2. Data Classification Endpoint
+
+**Path**: `/classify/`
+
+**Method**: `POST`
+
+**Request Parameters**:
+
+| Parameter     | Type      | Required | Description                                                                                |
+| ------------- | --------- | -------- | ------------------------------------------------------------------------------------------ |
+| `client_id`   | `string`  | Yes      | Client ID, used to separate different users' directories on the server.                   |
+| `path`        | `string`  | Yes      | Path of the Excel file to classify; this is the file path returned by the upload endpoint. |
+| `name_column` | `string`  | Yes      | Name of the column in the Excel file that contains the names to classify.                  |
+| `chunk_size`  | `int`     | No       | Defaults to 100; how many rows are processed per chunk.                                    |
+| `one_key`     | `string`  | Yes      | Name of the column that uniquely identifies each row in the Excel file.                    |
+| `api_key`     | `string`  | No       | Defaults to a built-in OpenAI key; pass your own OpenAI API key to use it instead.         |
+| `proxy`       | `boolean` | No       | Defaults to `false`; set `true` to route OpenAI calls through the proxy.                   |
+
+**Notes**:
+
+- **client_id**: Used to create or look up the directory for this client on the server, keeping each client's uploads and processing separate.
+- **path**: Must be the path of an actual file on the server; make sure it is correct and the file exists.
+- **name_column**: Make sure this column exists in the Excel file and contains the names to classify.
+- **chunk_size**: Controls how much data is processed per request; larger chunks may use more memory.
+- **one_key**: The unique-identifier column, used to deduplicate rows and avoid double processing.
+- **api_key**: An OpenAI API key; make sure it is valid and has the required permissions.
+- **proxy**: When `true`, API requests go through the proxy, which is typically useful in restricted network environments.
+
+**Description**:
+
+This endpoint classifies the data in an uploaded Excel file. It first checks for and cleans up expired files and directories on the server, then classifies the provided data. When classification finishes, it returns a download link for the result file.
+
+**Example Request**:
+
+```bash
+curl -X POST "http://{server_address}/classify/" \
+  -H "Content-Type: application/json" \
+  -d '{
+    "client_id": "12345",
+    "path": "./process/12345/your_file.xlsx",
+    "name_column": "姓名",
+    "chunk_size": 100,
+    "one_key": "ID",
+    "api_key": "your_openai_api_key",
+    "proxy": false
+  }'
+```
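+
+The same request in Python, mirroring `cls_process()` in client.py (the server address is a placeholder; `api_key` and `proxy` are omitted here to use the server defaults):
+
+```python
+import requests
+
+server_address = '10.41.3.69:8070'  # placeholder
+payload = {
+    'client_id': '12345',
+    'path': './process/12345/your_file.xlsx',
+    'name_column': '姓名',
+    'one_key': 'ID',
+}
+response = requests.post(f'http://{server_address}/classify/', json=payload)
+print(response.json())  # e.g. {'message': '分类完成', 'output_file': 'http://.../data/...'}
+```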
+
+**Responses**:
+
+- Success:
+
+```json
+{
+  "message": "分类完成",
+  "output_file": "http://{server_address}:8070/data/{client_id}/{output_file_name}"
+}
+```
+
+- Processing-failure response:
+
+```json
+{
+  "message": "文件没能处理成功"
+}
+```
+
+- Error:
+
+```json
+{
+  "message": "处理出现错误: [错误信息]"
+}
+```

+ 12 - 0
tools/md2word.py

@@ -0,0 +1,12 @@
+import pypandoc
+
+def convert_md_to_docx(md_file_path, output_docx_path):
+    try:
+        # Convert the markdown file to docx via pandoc
+        pypandoc.convert_file(md_file_path, 'docx', outputfile=output_docx_path)
+        print(f"成功将 {md_file_path} 转换为 {output_docx_path}")
+    except Exception as e:
+        print(f"转换失败: {e}")
+
+# Run the conversion
+convert_md_to_docx(r'E:\code\name_classify\readme.md', r'E:\code\name_classify\readme.docx')
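+
+# Note: pypandoc is a thin wrapper around the pandoc CLI, so a pandoc binary
+# must be available on PATH; pypandoc.download_pandoc() can fetch one if missing.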