name_classify.py

import ast
import json
import math
import os
import re

import openai
import pandas as pd
import requests

# The original file hard-coded several API keys (plus commented-out spares);
# read the key from the environment instead so no secret ships in source.
client = openai.OpenAI(api_key=os.environ['OPENAI_API_KEY'],
                       base_url='https://api.openai.com/v1')
# Alternative relay endpoint (the same host post_openai() calls below):
# client = openai.OpenAI(api_key=os.environ['RELAY_API_KEY'], base_url='https://fast.bemore.lol/v1')
# User-message template, kept in Chinese because the model is prompted in
# Chinese ("提供的数据" = "provided data", "返回的数据" = "returned data").
prompt = """提供的数据:{chunk}
返回的数据:"""
proxy = False      # True: call the relay via post_openai(); False: use the SDK client above
next_iter = False  # True: resume from save.json; False: fresh run from the Excel source
def save_dict_to_json(dictionary, filename):
    with open(filename, 'w', encoding='utf-8') as file:
        json.dump(dictionary, file, ensure_ascii=False, indent=4)


def load_dict_from_json(filename):
    with open(filename, 'r', encoding='utf-8') as file:
        return json.load(file)
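

# Round-trip sketch for the two checkpoint helpers above (hypothetical
# filename, kept in comments so the script's behaviour is unchanged):
#   save_dict_to_json({'chunk_1': []}, 'demo.json')
#   load_dict_from_json('demo.json')   # -> {'chunk_1': []}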
def split_dataframe_to_dict(df, chunk_size=100):
    # Number of slices needed
    num_chunks = math.ceil(len(df) / chunk_size)
    # Dict that collects the slices
    result_dict = {}
    for i in range(num_chunks):
        # Slice the DataFrame
        start = i * chunk_size
        end = min((i + 1) * chunk_size, len(df))
        chunk = df.iloc[start:end]
        # Convert the slice to a list of records and store it
        result_dict[f'chunk_{i+1}'] = chunk.to_dict(orient='records')
    return result_dict
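

# A worked example of the chunking (hypothetical three-row frame, in comments):
#   df = pd.DataFrame({'会员姓名': ['张三', '李四', '王五'], '会员分类': ['', '', '']})
#   split_dataframe_to_dict(df, chunk_size=2)
#   -> {'chunk_1': [{'会员姓名': '张三', '会员分类': ''},
#                   {'会员姓名': '李四', '会员分类': ''}],
#       'chunk_2': [{'会员姓名': '王五', '会员分类': ''}]}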
def extract_list_from_string(input_string):
    # Locate the list portion with a regular expression
    list_pattern = r'\[.*?\]'
    match = re.search(list_pattern, input_string, re.DOTALL)
    if match:
        list_string = match.group()
        try:
            # Parse the string safely with ast.literal_eval (literals only,
            # so nothing in the model's reply can execute)
            result = ast.literal_eval(list_string)
            # Check that the result really is a list
            if isinstance(result, list):
                return result
            else:
                print('Parsed result is not a list')
                return None
        except Exception as e:
            print(f'Parse error: {e}')
            return None
    else:
        print('No list structure found')
        return None
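

# Illustration (hypothetical model reply, in comments): the pattern grabs the
# first [...] span, so
#   extract_list_from_string("结果如下: [{'会员姓名': '张三', '会员分类': '亚裔华人'}] 请查收")
#   -> [{'会员姓名': '张三', '会员分类': '亚裔华人'}]
# The non-greedy match stops at the first ']'; that is fine for a flat list of
# dicts but would truncate a reply containing nested lists.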
def post_openai(messages):
    # Raw HTTP call to the relay endpoint, bypassing the SDK client above
    base_url = 'https://fast.bemore.lol'
    skey = os.environ['RELAY_API_KEY']  # assumed env-var name; the original hard-coded the key here
    payload = json.dumps({
        'model': 'gpt-4',
        'messages': messages
    })
    url = base_url + '/v1/chat/completions'
    headers = {
        'Accept': 'application/json',
        'Authorization': f'Bearer {skey}',
        'User-Agent': 'Apifox/1.0.0 (https://apifox.com)',
        'Content-Type': 'application/json'
    }
    response = requests.post(url, headers=headers, data=payload)
    print(response)
    # Parse the JSON body into a Python dict
    data = response.json()
    # Pull out the content field
    content = data['choices'][0]['message']['content']
    return content
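

# Usage sketch (in comments; assumes RELAY_API_KEY is exported):
#   reply = post_openai([{'role': 'user', 'content': '你好'}])
#   print(reply)
# The relay is called with plain requests because the openai client above is
# pinned to the official base_url.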
# Source spreadsheet; columns used: 会员姓名 (member name), 会员分类 (member classification)
path = '会员列表.xlsx'
if not next_iter:
    # Fresh run: load the member list, add an empty classification column,
    # chunk it, checkpoint the chunks to save.json, and start the output CSV
    # with a header row.
    df = pd.read_excel(path)
    df['会员分类'] = ''
    deal_result = split_dataframe_to_dict(df)
    save_dict_to_json(deal_result, 'save.json')
    df_all = pd.DataFrame(columns=['会员姓名', '会员分类'])
    df_all.to_csv('会员列表.csv', mode='a', header=True, index=False)
else:
    # Resume: reload the checkpoint and drop chunks that already finished.
    deal_result = load_dict_from_json('save.json')
    keys_to_remove = []
    # e.g. keys_to_remove = [f'chunk_{i+1}' for i in range(43)]
    #      keys_to_remove.remove('chunk_17')
    deal_result = {k: v for k, v in deal_result.items() if k not in keys_to_remove}
    save_dict_to_json(deal_result, 'save_new.json')
print(deal_result.keys())
for name, value in deal_result.items():
    print(name)
    message = [
        # System prompt, kept in Chinese. Translation: "You are a name-judging
        # expert. For the member name (会员姓名) in every dict of the provided
        # list, classify it into one of three classes: 亚裔华人 (Asian Chinese),
        # 亚裔非华人 (Asian non-Chinese), 非亚裔 (non-Asian); fill the result
        # into 会员分类 and return the data in the same format as provided."
        {'role': 'system', 'content': '你是一个名字判断专家,你需要根据提供的列表中的每一个字典元素的会员姓名,判断其名字分类,分别为3类: 亚裔华人,亚裔非华人, 非亚裔,并将结果填充到会员分类中, 整合之后返回与提供数据一样的格式给我'},
        {'role': 'user', 'content': prompt.format(chunk=str(value))}
    ]
    if not proxy:
        response = client.chat.completions.create(model='gpt-4', messages=message)
        result_string = response.choices[0].message.content
    else:
        result_string = post_openai(message)
    result = extract_list_from_string(result_string)
    if result:
        # Append this chunk's labelled rows to the running CSV
        df = pd.DataFrame(result)
        df.to_csv('会员列表.csv', mode='a', header=False, index=False)
        print(df.info())
        print(f'{name} processed, please verify!')
    else:
        print(f'{name} failed, please debug it manually')
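

# Resume sketch (in comments; assumes chunks 1-3 already landed in 会员列表.csv):
#   next_iter = True
#   keys_to_remove = [f'chunk_{i+1}' for i in range(3)]
# Rerunning then sends only the remaining chunks to the model.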
# Reconcile the model's labels with the original member list.
df_origin = pd.read_excel('会员列表.xlsx')
df_result = pd.read_csv('会员列表.csv')
print(df_result['会员分类'].unique())
df_result.to_excel('result.xlsx')
# Normalise the model's free-form labels to 华裔 / 非华裔. Replacement order
# matters: 'Non-Chinese' before 'Chinese', '非华人' before '华人'. (The original
# swapped the targets of '非华人' and '华人', which inverted those two labels.)
df_result['会员分类'] = df_result['会员分类'].apply(
    lambda x: str(x)
    .replace('Non-Chinese', '非华裔').replace('non-Chinese', '非华裔')
    .replace('Chinese', '华裔')
    .replace('可能是华裔', '华裔').replace('可能华裔', '华裔')
    .replace('非华人', '非华裔').replace('华人', '华裔')
    .replace('是', '华裔').replace('否', '非华裔'))
df_origin_use = df_origin[['会员姓名']]
df_result.to_excel('加分类的原始表.xlsx')
print('*******************************')
# De-duplicate names so the left join stays one row per member
df_origin_use = df_origin_use.drop_duplicates(subset=['会员姓名'])
print('*******************************')
df_final = df_origin_use.merge(df_result, on='会员姓名', how='left')
df_final = df_final.drop_duplicates(subset=['会员姓名'])
df_final.to_excel('加分类并对原表对应.xlsx')
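
# Run notes (assumptions: Python 3.9+, pandas with openpyxl installed for the
# Excel I/O, and OPENAI_API_KEY / RELAY_API_KEY exported):
#   python name_classify.py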