# name_classify_api.py
  1. import pandas as pd
  2. import math, json, os
  3. import openai, re, ast, requests
  4. from fastapi import FastAPI, UploadFile, File, Form
  5. from pydantic import BaseModel
  6. from fastapi.responses import JSONResponse
  7. from datetime import datetime
  8. import uvicorn
  9. from tqdm import tqdm
# FastAPI application instance; the route handlers below are registered on it.
app = FastAPI()
class ClassificationRequest(BaseModel):
    """Request payload for the /classify/ endpoint."""
    path: str         # path to the uploaded Excel file to classify
    client_id: str    # per-client workspace id; work files go under ./process/<client_id>
    one_key: str      # unique-key column used with 'name' to de-duplicate after the merge
    name_column: str  # column in the Excel sheet that holds the member names
    api_key: str      # OpenAI API key supplied by the caller
    proxy: bool       # False: direct api.openai.com; True: route via the proxy base_url
    chunk_size: int   # number of rows per chunk sent to the model
  19. def save_dict_to_json(dictionary, filename):
  20. with open(filename, 'w', encoding='utf-8') as file:
  21. json.dump(dictionary, file, ensure_ascii=False, indent=4)
  22. def load_dict_from_json(filename):
  23. with open(filename, 'r', encoding='utf-8') as file:
  24. return json.load(file)
  25. def split_dataframe_to_dict(df, chunk_size=100):
  26. # 计算需要切割的份数
  27. num_chunks = math.ceil(len(df) / chunk_size)
  28. # 用于存储结果的字典
  29. result_dict = {}
  30. for i in range(num_chunks):
  31. # 切割 DataFrame
  32. start = i * chunk_size
  33. end = min((i + 1) * chunk_size, len(df))
  34. chunk = df.iloc[start:end]
  35. # 将切割后的 DataFrame 转换为字典并存储
  36. result_dict[f'chunk_{i+1}'] = chunk.to_dict(orient='records')
  37. return result_dict
  38. def extract_list_from_string(input_string):
  39. # 使用正则表达式查找列表部分
  40. list_pattern = r'\[.*?\]'
  41. match = re.search(list_pattern, input_string, re.DOTALL)
  42. if match:
  43. list_string = match.group()
  44. try:
  45. # 使用 ast.literal_eval 安全地解析字符串
  46. result = ast.literal_eval(list_string)
  47. # 检查结果是否为列表
  48. if isinstance(result, list):
  49. return result
  50. else:
  51. print("解析结果不是列表")
  52. return None
  53. except Exception as e:
  54. print(f"解析错误: {e}")
  55. return None
  56. else:
  57. print("未找到列表结构")
  58. return None
  59. def post_openai(messages):
  60. Baseurl = "https://fast.bemore.lol"
  61. Skey = "sk-dxl4rt2wWswbdrCr1c7b8500B68c43F5B6175b90F7D672C4"
  62. payload = json.dumps({
  63. "model": "gpt-4",
  64. "messages": messages
  65. })
  66. url = Baseurl + "/v1/chat/completions"
  67. headers = {
  68. 'Accept': 'application/json',
  69. 'Authorization': f'Bearer {Skey}',
  70. 'User-Agent': 'Apifox/1.0.0 (https://apifox.com)',
  71. 'Content-Type': 'application/json'
  72. }
  73. response = requests.request("POST", url, headers=headers, data=payload)
  74. # 解析 JSON 数据为 Python 字典
  75. print(response)
  76. data = response.json()
  77. # 获取 content 字段的值
  78. content = data['choices'][0]['message']['content']
  79. return content
  80. @app.post("/uploadfile/")
  81. async def create_upload_file(file: UploadFile = File(...), client_id: str = Form(...)):
  82. user_directory = f'./process/{client_id}'
  83. if not os.path.exists(user_directory):
  84. os.makedirs(user_directory)
  85. os.chmod(user_directory, 0o777) # 设置用户目录权限为777
  86. print(user_directory)
  87. print(file.filename)
  88. file_location = os.path.join(user_directory, file.filename)
  89. print(file_location)
  90. try:
  91. with open(file_location, "wb+") as file_object:
  92. file_object.write(file.file.read())
  93. os.chmod(file_location, 0o777) # 设置文件权限为777
  94. return JSONResponse(content={
  95. "message": f"文件 '{file.filename}' 上传成功",
  96. "client_id": client_id,
  97. "file_path": file_location
  98. }, status_code=200)
  99. except Exception as e:
  100. return JSONResponse(content={"message": f"发生错误: {str(e)}"}, status_code=500)
  101. @app.post("/classify/")
  102. async def classify_data(request: ClassificationRequest):
  103. try:
  104. prompt = """提供的数据:{chunk}
  105. 返回的数据:"""
  106. work_path = f'./process/{request.client_id}'
  107. if not os.path.exists(work_path):
  108. os.makedirs(work_path, exist_ok=True)
  109. timestamp_str = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
  110. df_origin = pd.read_excel(request.path)
  111. df_origin['name'] = df_origin[request.name_column]
  112. df_origin['classify'] = ''
  113. df_use = df_origin[['name', 'classify']]
  114. deal_result = split_dataframe_to_dict(df_use, request.chunk_size)
  115. # 生成当前时间的时间戳字符串
  116. temp_csv = work_path + '/' + timestamp_str + 'output_temp.csv'
  117. final_file_name, final_file_extension = os.path.splitext(os.path.basename(request.path))
  118. # 添加后缀
  119. final_file = final_file_name + '_classify' + final_file_extension
  120. # 生成新的文件路径
  121. new_file_path = os.path.join(os.path.dirname(request.path), final_file)
  122. if not request.proxy:
  123. print(f'用户{request.client_id}正在使用直连的gpt-API')
  124. client = openai.OpenAI(api_key=request.api_key, base_url='https://api.openai.com/v1')
  125. else:
  126. client = openai.OpenAI(api_key=request.api_key, base_url='https://fast.bemore.lol/v1')
  127. for name, value in tqdm(deal_result.items(), desc='Processing', unit='item'):
  128. try:
  129. message = [
  130. {'role':'system', 'content': '你是一个名字判断专家,你需要根据提供的列表中的每一个字典元素的会员姓名,判断其名字分类,分别为3类: 亚裔华人,亚裔非华人, 非亚裔,并将结果填充到会员分类中, 整合之后返回与提供数据一样的格式给我'},
  131. {'role':'user', 'content':prompt.format(chunk=str(value))}
  132. ]
  133. # result_string = post_openai(message)
  134. response = client.chat.completions.create(model='gpt-4',messages=message)
  135. result_string = response.choices[0].message.content
  136. result = extract_list_from_string(result_string)
  137. if result:
  138. df_output = pd.DataFrame(result)
  139. df_output.to_csv(temp_csv, mode='a', header=True, index=False)
  140. else:
  141. continue
  142. except Exception as e:
  143. print(f'{name}出现问题啦, 错误为:{e} 请自行调试')
  144. if os.path.exists(temp_csv):
  145. df_result = pd.read_csv(temp_csv)
  146. df_final = df_origin.merge(df_result, on='name', how='left').drop_duplicates(subset=[request.one_key,'name'], keep='first')
  147. df_final.to_excel(new_file_path)
  148. return {"message": "分类完成", "output_file": new_file_path}
  149. else:
  150. return {"message": "文件没能处理成功"}
  151. except Exception as e:
  152. return {"message": f"处理出现错误: {e}"}
if __name__ == "__main__":
    # Development entry point: serve the app on all interfaces, port 8070.
    uvicorn.run(app, host="0.0.0.0", port=8070)