
Add readme, extract config, clean up client

wangdalin · 10 months ago
Commit e627f79d74
6 files changed, 318 insertions and 139 deletions
  1. client.py (+60 −48)
  2. config.py (+23 −0)
  3. functions.py (+74 −0)
  4. name_classify_api.py (+18 −91)
  5. readme.md (+131 −0)
  6. tools/md2word.py (+12 −0)

+ 60 - 48
client.py

@@ -1,48 +1,60 @@
-import requests
-
-# Build the request payload
-data = {
-    "path": "./process/wangdalin\\人群分类-自建站2024.08.xlsx",
-    "client_id": "wangdalin",
-    "one_key": "网店单号",
-    "name_column": "收货姓名",
-    "api_key": "sk-iREtaVNjamaBArOTlc_2BfGFJVPiU-9EjSFMUspIPBT3BlbkFJxS0SMmKZD9L9UumPczee4VKawCwVeGBQAr9MgsWGkA",
-    "proxy": False,  # set True or False as needed
-    "chunk_size": 100
-}
-
-# Send the POST request
-response = requests.post("http://10.41.3.69:8070/classify/", json=data)
-
-# Handle the response
-if response.status_code == 200:
-    result = response.json()
-    print("API response:", result)
-else:
-    print(f"Request failed, status code: {response.status_code}")
-    print("Error details:", response.text)
-
-
-################################
-# # File path
-# file_path = 'E:/code/name_classify/data_before/人群分类-自建站2024.08.xlsx'
-
-# # Form data
-# data = {
-#     'client_id': 'wangdalin'
-# }
-
-# # Send the POST request
-# with open(file_path, 'rb') as file:
-#     files = {
-#         'file': ('人群分类-自建站2024.08.xlsx', file, 'text/plain')  # 'text/plain' is the MIME type; adjust as needed
-#     }
-#     response = requests.post("http://10.41.3.69:8070/uploadfile/", data=data, files=files)
-
-# # Handle the response
-# if response.status_code == 200:
-#     result = response.json()
-#     print("API response:", result)
-# else:
-#     print(f"Request failed, status code: {response.status_code}")
-#     print("Error details:", response.text)
+import requests, os
+
+def cls_process(path, client_id, one_key, name_column, api_key="sk-iREtaVNjamaBArOTlc_2BfGFJVPiU-9EjSFMUspIPBT3BlbkFJxS0SMmKZD9L9UumPczee4VKawCwVeGBQAr9MgsWGkA",
+                proxy=False, chunk_size=100):
+    # Build the request payload
+    data = {
+        "path": path,
+        "client_id": client_id,
+        "one_key": one_key,
+        "name_column": name_column,
+        "api_key": api_key,
+        "proxy": proxy,
+        "chunk_size": chunk_size
+    }
+    # Send the POST request
+    response = requests.post("http://10.41.3.69:8070/classify/", json=data)
+
+    # Handle the response
+    if response.status_code == 200:
+        result = response.json()
+        print("API response:", result)
+    else:
+        print(f"Request failed, status code: {response.status_code}")
+        print("Error details:", response.text)
+
+
+def upload_process(file_path, client_id):
+    # Form data
+    data = {
+        'client_id': client_id
+    }
+
+    # Send the POST request
+    with open(file_path, 'rb') as file:
+        name = os.path.basename(file_path)
+        files = {
+            'file': (name, file, 'text/plain')  # 'text/plain' is the MIME type; adjust as needed
+        }
+        response = requests.post("http://10.41.3.69:8070/uploadfile/", data=data, files=files)
+
+    # Handle the response
+    if response.status_code == 200:
+        result = response.json()
+        print("API response:", result)
+        return result.get('file_path')
+    else:
+        print(f"Request failed, status code: {response.status_code}")
+        print("Error details:", response.text)
+        return None
+
+def both_process(path, client_id, one_key, name_column):
+    file_path = upload_process(file_path=path, client_id=client_id)
+    if file_path:
+        cls_process(path=file_path, client_id=client_id, one_key=one_key, name_column=name_column)
+
+if __name__ == "__main__":
+    both_process(path="E:/code/name_classify/data_before/人群分类-自建站2024.08.xlsx", client_id='wangdalin', one_key='网店单号', name_column='收货姓名')
+    # upload_process(file_path="E:/code/name_classify/data_before/人群分类-自建站2024.08.xlsx", client_id='wangdalin')
+    # cls_process(path="./process/wangdalin\\人群分类-自建站2024.08.xlsx", client_id='wangdalin', one_key='网店单号', name_column='收货姓名')
+

+ 23 - 0
config.py

@@ -0,0 +1,23 @@
+import socket
+
+def get_ip_address():
+    try:
+        # Create a UDP socket
+        s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
+        # Connect to a public DNS server (Google DNS here)
+        s.connect(("8.8.8.8", 80))
+        # Read back the local IP address chosen for that route
+        ip_address = s.getsockname()[0]
+        s.close()
+        return ip_address
+    except Exception as e:
+        return str(e)
+
+api_key = "sk-iREtaVNjamaBArOTlc_2BfGFJVPiU-9EjSFMUspIPBT3BlbkFJxS0SMmKZD9L9UumPczee4VKawCwVeGBQAr9MgsWGkA"
+basic_path = './process'
+openai_url = 'https://api.openai.com/v1'
+proxy_url = 'https://fast.bemore.lol/v1'
+cls_system_prompt = '你是一个名字判断专家,你需要根据提供的列表中的每一个字典元素的会员姓名,判断其名字分类,分别为3类: 亚裔华人,亚裔非华人, 非亚裔,并将结果填充到会员分类中, 整合之后返回与提供数据一样的格式给我'
+user_prompt = """提供的数据:{chunk}
+                返回的数据:"""
+port = 8070
+file_base_url = f'http://{get_ip_address()}:{port}/data'
+

+ 74 - 0
functions.py

@@ -0,0 +1,74 @@
+import re, ast, requests
+import math, json
+
+def save_dict_to_json(dictionary, filename):
+    with open(filename, 'w', encoding='utf-8') as file:
+        json.dump(dictionary, file, ensure_ascii=False, indent=4)
+
+def load_dict_from_json(filename):
+    with open(filename, 'r', encoding='utf-8') as file:
+        return json.load(file)
+
+def split_dataframe_to_dict(df, chunk_size=100):
+    # Number of chunks needed
+    num_chunks = math.ceil(len(df) / chunk_size)
+
+    # Dict that collects the chunks
+    result_dict = {}
+
+    for i in range(num_chunks):
+        # Slice the DataFrame
+        start = i * chunk_size
+        end = min((i + 1) * chunk_size, len(df))
+        chunk = df.iloc[start:end]
+
+        # Convert the slice to a list of records and store it
+        result_dict[f'chunk_{i+1}'] = chunk.to_dict(orient='records')
+
+    return result_dict
+
+def extract_list_from_string(input_string):
+    # Locate the list portion with a regex
+    list_pattern = r'\[.*?\]'
+    match = re.search(list_pattern, input_string, re.DOTALL)
+
+    if match:
+        list_string = match.group()
+        try:
+            # Parse the string safely with ast.literal_eval
+            result = ast.literal_eval(list_string)
+
+            # Make sure the parsed result is a list
+            if isinstance(result, list):
+                return result
+            else:
+                print("Parsed result is not a list")
+                return None
+        except Exception as e:
+            print(f"Parse error: {e}")
+            return None
+    else:
+        print("No list structure found")
+        return None
+
+def post_openai(messages):
+    Baseurl = "https://fast.bemore.lol"
+    Skey = "sk-dxl4rt2wWswbdrCr1c7b8500B68c43F5B6175b90F7D672C4"
+    payload = json.dumps({
+        "model": "gpt-4",
+        "messages": messages
+    })
+    url = Baseurl + "/v1/chat/completions"
+    headers = {
+        'Accept': 'application/json',
+        'Authorization': f'Bearer {Skey}',
+        'User-Agent': 'Apifox/1.0.0 (https://apifox.com)',
+        'Content-Type': 'application/json'
+    }
+
+    response = requests.request("POST", url, headers=headers, data=payload)
+
+    # Parse the JSON response into a dict
+    print(response)
+    data = response.json()
+
+    # Extract the content field
+    content = data['choices'][0]['message']['content']
+
+    return content

+ 18 - 91
name_classify_api.py

@@ -1,104 +1,33 @@
import pandas as pd
-import math, json, os, time, shutil
-import openai, re, ast, requests
+import os, time, shutil
+import openai
from fastapi import FastAPI, UploadFile, File, Form
from pydantic import BaseModel
from fastapi.responses import JSONResponse
from datetime import datetime
-import uvicorn
+import uvicorn, socket
from tqdm import tqdm
+from fastapi.staticfiles import StaticFiles
+from config import *
+from functions import split_dataframe_to_dict, extract_list_from_string
app = FastAPI()
+app.mount("/data", StaticFiles(directory='./process'), name="static")
class ClassificationRequest(BaseModel):
    path: str
    client_id: str
    one_key: str
    name_column: str
-    api_key: str
-    proxy: bool
-    chunk_size: int
-def save_dict_to_json(dictionary, filename):
-    with open(filename, 'w', encoding='utf-8') as file:
-        json.dump(dictionary, file, ensure_ascii=False, indent=4)
-
-def load_dict_from_json(filename):
-    with open(filename, 'r', encoding='utf-8') as file:
-        return json.load(file)
-def split_dataframe_to_dict(df, chunk_size=100):
-    # Number of chunks needed
-    num_chunks = math.ceil(len(df) / chunk_size)
-
-    # Dict that collects the chunks
-    result_dict = {}
-
-    for i in range(num_chunks):
-        # Slice the DataFrame
-        start = i * chunk_size
-        end = min((i + 1) * chunk_size, len(df))
-        chunk = df.iloc[start:end]
-
-        # Convert the slice to a list of records and store it
-        result_dict[f'chunk_{i+1}'] = chunk.to_dict(orient='records')
-
-    return result_dict
-def extract_list_from_string(input_string):
-    # Locate the list portion with a regex
-    list_pattern = r'\[.*?\]'
-    match = re.search(list_pattern, input_string, re.DOTALL)
-
-    if match:
-        list_string = match.group()
-        try:
-            # Parse the string safely with ast.literal_eval
-            result = ast.literal_eval(list_string)
-
-            # Make sure the parsed result is a list
-            if isinstance(result, list):
-                return result
-            else:
-                print("Parsed result is not a list")
-                return None
-        except Exception as e:
-            print(f"Parse error: {e}")
-            return None
-    else:
-        print("No list structure found")
-        return None
-def post_openai(messages):
-    Baseurl = "https://fast.bemore.lol"
-    Skey = "sk-dxl4rt2wWswbdrCr1c7b8500B68c43F5B6175b90F7D672C4"
-    payload = json.dumps({
-        "model": "gpt-4",
-        "messages": messages
-    })
-    url = Baseurl + "/v1/chat/completions"
-    headers = {
-        'Accept': 'application/json',
-        'Authorization': f'Bearer {Skey}',
-        'User-Agent': 'Apifox/1.0.0 (https://apifox.com)',
-        'Content-Type': 'application/json'
-    }
-
-    response = requests.request("POST", url, headers=headers, data=payload)
-
-    # Parse the JSON response into a dict
-    print(response)
-    data = response.json()
-
-    # Extract the content field
-    content = data['choices'][0]['message']['content']
-
-    return content
+    api_key: str = "sk-iREtaVNjamaBArOTlc_2BfGFJVPiU-9EjSFMUspIPBT3BlbkFJxS0SMmKZD9L9UumPczee4VKawCwVeGBQAr9MgsWGkA"
+    proxy: bool = False
+    chunk_size: int = 100
 
 
@app.post("/uploadfile/")
async def create_upload_file(file: UploadFile = File(...), client_id: str = Form(...)):
-    user_directory = f'./process/{client_id}'
+    user_directory = f'{basic_path}/{client_id}'
    if not os.path.exists(user_directory):
        os.makedirs(user_directory)
        os.chmod(user_directory, 0o777)  # Set the user directory permissions to 777
-    print(user_directory)
-    print(file.filename)
    file_location = os.path.join(user_directory, file.filename)
-    print(file_location)
    try:
        with open(file_location, "wb+") as file_object:
            file_object.write(file.file.read())
@@ -117,7 +46,6 @@ async def classify_data(request: ClassificationRequest):
        current_time = time.time()
        TIME_THRESHOLD_FILEPATH = 30 * 24 * 60 * 60
        TIME_THRESHOLD_FILE = 10 * 24 * 60 * 60
-        basic_path = './process'
        for root, dirs, files in os.walk(basic_path, topdown=False):
            # Delete stale files
            for file in files:
@@ -131,8 +59,7 @@ async def classify_data(request: ClassificationRequest):
                if current_time - os.path.getmtime(dir_path) > TIME_THRESHOLD_FILEPATH:
                    print(f"Deleting directory: {dir_path}")
                    shutil.rmtree(dir_path)
-        prompt = """提供的数据:{chunk}
-                返回的数据:"""
+        
        work_path = f'{basic_path}/{request.client_id}'
        if not os.path.exists(work_path):
            os.makedirs(work_path, exist_ok=True)
@@ -154,14 +81,14 @@ async def classify_data(request: ClassificationRequest):
 
 
        if not request.proxy:
            print(f'User {request.client_id} is calling the GPT API directly')
-            client = openai.OpenAI(api_key=request.api_key, base_url='https://api.openai.com/v1')
+            client = openai.OpenAI(api_key=request.api_key, base_url=openai_url)
        else:
-            client = openai.OpenAI(api_key=request.api_key, base_url='https://fast.bemore.lol/v1')
+            client = openai.OpenAI(api_key=request.api_key, base_url=proxy_url)
        for name, value in tqdm(deal_result.items(), desc='Processing', unit='item'):
            try:
                message = [
-                    {'role':'system', 'content': '你是一个名字判断专家,你需要根据提供的列表中的每一个字典元素的会员姓名,判断其名字分类,分别为3类: 亚裔华人,亚裔非华人, 非亚裔,并将结果填充到会员分类中, 整合之后返回与提供数据一样的格式给我'},
-                    {'role':'user', 'content':prompt.format(chunk=str(value))}
+                    {'role':'system', 'content': cls_system_prompt},
+                    {'role':'user', 'content':user_prompt.format(chunk=str(value))}
                ]
                # result_string = post_openai(message)
                response = client.chat.completions.create(model='gpt-4', messages=message)
@@ -178,11 +105,11 @@ async def classify_data(request: ClassificationRequest):
            df_result = pd.read_csv(temp_csv)
            df_final = df_origin.merge(df_result, on='name', how='left').drop_duplicates(subset=[request.one_key, 'name'], keep='first')
            df_final.to_excel(new_file_path)
-            return {"message": "Classification complete", "output_file": new_file_path}
+            return {"message": "Classification complete", "output_file": file_base_url + new_file_path.split(basic_path)[1]}
        else:
            return {"message": "File could not be processed"}
    except Exception as e:
        return {"message": f"Processing error: {e}"}

if __name__ == "__main__":
-    uvicorn.run(app, host="0.0.0.0", port=8070)
+    uvicorn.run(app, host="0.0.0.0", port=port)

+ 131 - 0
readme.md

@@ -0,0 +1,131 @@
+The following documents the two API endpoints, written in Markdown:
+
+---
+
+## API Documentation
+
+### 1. File Upload Endpoint
+
+**Path**: `/uploadfile/`
+
+**Method**: `POST`
+
+**Request parameters**:
+
+| Parameter   | Type         | Required | Description                                       |
+| ----------- | ------------ | -------- | ------------------------------------------------- |
+| `file`      | `UploadFile` | Yes      | The file to upload                                |
+| `client_id` | `string`     | Yes      | Client ID, used to create the per-user directory  |
+
+**Description**:
+
+This endpoint uploads a file and stores it in a dedicated directory on the server; each client gets its own directory. After upload, the system sets permissions on the file and directory.
+
+**Example request**:
+
+```bash
+curl -X POST "http://{server_address}/uploadfile/" \
+  -F "file=@path/to/your/file.xlsx" \
+  -F "client_id=12345"
+```
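+
+A minimal Python equivalent using `requests` (a sketch only; the host, file name, and `client_id` are placeholders):
+
+```python
+import requests
+
+server_address = "10.41.3.69:8070"  # placeholder; use your server host:port
+
+# Upload a file on behalf of client 12345
+with open("path/to/your/file.xlsx", "rb") as f:
+    response = requests.post(
+        f"http://{server_address}/uploadfile/",
+        data={"client_id": "12345"},
+        files={"file": ("file.xlsx", f, "text/plain")},
+    )
+
+print(response.json())  # on success, includes the saved file_path
+```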
+
+**Response**:
+
+- Success:
+
+```json
+{
+  "message": "File 'your_file.xlsx' uploaded successfully",
+  "client_id": "12345",
+  "file_path": "./process/12345/your_file.xlsx"
+}
+```
+
+- Error:
+
+```json
+{
+  "message": "An error occurred: [error details]"
+}
+```
+
+---
+
+### 2. Data Classification Endpoint
+
+**Path**: `/classify/`
+
+**Method**: `POST`
+
+**Request parameters**:
+
+| Parameter     | Type      | Required | Description                                                                     |
+| ------------- | --------- | -------- | ------------------------------------------------------------------------------- |
+| `client_id`   | `string`  | Yes      | Client ID, used to separate user directories on the server.                      |
+| `path`        | `string`  | Yes      | Path of the Excel file to classify, as returned by the upload endpoint.          |
+| `name_column` | `string`  | Yes      | Name of the column in the Excel file that contains the names to classify.        |
+| `chunk_size`  | `int`     | No       | Defaults to 100; size of each data chunk when processing in batches.             |
+| `one_key`     | `string`  | Yes      | Name of the column that uniquely identifies each row in the Excel file.          |
+| `api_key`     | `string`  | No       | A default OpenAI key is built in; pass your own OpenAI API key to override it.   |
+| `proxy`       | `boolean` | No       | Defaults to `false`; set `true` to route OpenAI requests through the proxy.      |
+
+**Notes**:
+
+- **client_id**: Used to create or locate the directory for this client on the server, keeping each user's uploads and processing isolated.
+- **path**: Must point to an actual file on the server; make sure the path is correct and the file exists.
+- **name_column**: Must exist in the Excel file and contain the name data to classify.
+- **chunk_size**: Controls how much data is handled per batch; larger chunks may increase memory usage.
+- **one_key**: The unique-identifier column is used to deduplicate rows and avoid processing them twice.
+- **api_key**: The OpenAI API key; make sure it is valid and has the required permissions.
+- **proxy**: When `true`, API requests go through the proxy, which is mainly useful in restricted network environments.
+
+**Description**:
+
+This endpoint classifies the data in an uploaded Excel file. It first checks for and removes expired files and directories on the server, then classifies the submitted data in chunks. When classification finishes, it returns a download link for the result file.
+
+**Example request**:
+
+```bash
+curl -X POST "http://{server_address}/classify/" \
+  -H "Content-Type: application/json" \
+  -d '{
+    "client_id": "12345",
+    "path": "./process/12345/your_file.xlsx",
+    "name_column": "姓名",
+    "chunk_size": 100,
+    "one_key": "ID",
+    "api_key": "your_openai_api_key",
+    "proxy": false
+  }'
+```
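+
+The same request from Python, as a minimal sketch (field values are placeholders; `path` should be the `file_path` returned by the upload endpoint):
+
+```python
+import requests
+
+server_address = "10.41.3.69:8070"  # placeholder; use your server host:port
+
+# JSON body mirrors the ClassificationRequest model
+payload = {
+    "client_id": "12345",
+    "path": "./process/12345/your_file.xlsx",
+    "name_column": "姓名",
+    "chunk_size": 100,
+    "one_key": "ID",
+    "api_key": "your_openai_api_key",
+    "proxy": False,
+}
+
+response = requests.post(f"http://{server_address}/classify/", json=payload)
+print(response.json())  # on success, includes the output_file download link
+```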
+
+**Response**:
+
+- Success:
+
+```json
+{
+  "message": "Classification complete",
+  "output_file": "http://{server_address}:8070/data/{client_id}/{output_file_name}"
+}
+```
+
+- Processing failure:
+
+```json
+{
+  "message": "File could not be processed"
+}
+```
+
+- Error:
+
+```json
+{
+  "message": "Processing error: [error details]"
+}
+```

+ 12 - 0
tools/md2word.py

@@ -0,0 +1,12 @@
+import pypandoc
+
+def convert_md_to_docx(md_file_path, output_docx_path):
+    try:
+        # Use pandoc to convert the Markdown file to docx
+        pypandoc.convert_file(md_file_path, 'docx', outputfile=output_docx_path)
+        print(f"Successfully converted {md_file_path} to {output_docx_path}")
+    except Exception as e:
+        print(f"Conversion failed: {e}")
+
+# Run the conversion
+convert_md_to_docx(r'E:\code\name_classify\readme.md', r'E:\code\name_classify\readme.docx')