# name_classify.py
  1. import pandas as pd
  2. import math, json
  3. import openai, re, ast, requests
  4. # client = openai.OpenAI(api_key='sk-dxl4rt2wWswbdrCr1c7b8500B68c43F5B6175b90F7D672C4', base_url='https://fast.bemore.lol')
  5. # client = openai.OpenAI(api_key='sk-proj-d9t4UfFWe65DE9KHalwTT3BlbkFJTj0H36OsRBnu9Te9KOL7', base_url='https://api.openai.com/v1')
  6. client = openai.OpenAI(api_key='sk-proj-LA7DMHievtKRMZfQwZ1OT3BlbkFJmLMamYoZ5yWynUHOYtZ2', base_url='https://api.openai.com/v1')
  7. prompt = """提供的数据:{chunk}
  8. 返回的数据:"""
  9. proxy = False
  10. next_iter = True
  11. message = [
  12. {'role':'system', 'content': '你是一个名字判断专家,你需要根据提供的列表中的每一个字典元素的会员姓名,判断其是否为华裔的名字,结果为 华裔 或者为 非华裔,并将结果填充到会员分类中, 整合之后返回与提供数据一样的格式给我'},
  13. {'role':'user', 'content':'你好,你是谁'}
  14. ]
  15. if not proxy:
  16. response = client.chat.completions.create(model='gpt-4',messages=message)
  17. result_string = response.choices[0].message.content
  18. print(result_string)
  19. def save_dict_to_json(dictionary, filename):
  20. with open(filename, 'w', encoding='utf-8') as file:
  21. json.dump(dictionary, file, ensure_ascii=False, indent=4)
  22. def load_dict_from_json(filename):
  23. with open(filename, 'r', encoding='utf-8') as file:
  24. return json.load(file)
  25. def split_dataframe_to_dict(df, chunk_size=100):
  26. # 计算需要切割的份数
  27. num_chunks = math.ceil(len(df) / chunk_size)
  28. # 用于存储结果的字典
  29. result_dict = {}
  30. for i in range(num_chunks):
  31. # 切割 DataFrame
  32. start = i * chunk_size
  33. end = min((i + 1) * chunk_size, len(df))
  34. chunk = df.iloc[start:end]
  35. # 将切割后的 DataFrame 转换为字典并存储
  36. result_dict[f'chunk_{i+1}'] = chunk.to_dict(orient='records')
  37. return result_dict
  38. def extract_list_from_string(input_string):
  39. # 使用正则表达式查找列表部分
  40. list_pattern = r'\[.*?\]'
  41. match = re.search(list_pattern, input_string, re.DOTALL)
  42. if match:
  43. list_string = match.group()
  44. try:
  45. # 使用 ast.literal_eval 安全地解析字符串
  46. result = ast.literal_eval(list_string)
  47. # 检查结果是否为列表
  48. if isinstance(result, list):
  49. return result
  50. else:
  51. print("解析结果不是列表")
  52. return None
  53. except Exception as e:
  54. print(f"解析错误: {e}")
  55. return None
  56. else:
  57. print("未找到列表结构")
  58. return None
  59. def post_openai(messages):
  60. Baseurl = "https://fast.bemore.lol"
  61. Skey = "sk-dxl4rt2wWswbdrCr1c7b8500B68c43F5B6175b90F7D672C4"
  62. payload = json.dumps({
  63. "model": "gpt-4",
  64. "messages": messages
  65. })
  66. url = Baseurl + "/v1/chat/completions"
  67. headers = {
  68. 'Accept': 'application/json',
  69. 'Authorization': f'Bearer {Skey}',
  70. 'User-Agent': 'Apifox/1.0.0 (https://apifox.com)',
  71. 'Content-Type': 'application/json'
  72. }
  73. response = requests.request("POST", url, headers=headers, data=payload)
  74. # 解析 JSON 数据为 Python 字典
  75. print(response)
  76. data = response.json()
  77. # 获取 content 字段的值
  78. content = data['choices'][0]['message']['content']
  79. return content
  80. # 示例使用
  81. # 假设你有一个名为 'your_dataframe' 的 DataFrame
  82. if not next_iter:
  83. df = pd.read_excel('会员列表.xlsx')
  84. df['会员分类'] = ''
  85. deal_result = split_dataframe_to_dict(df)
  86. save_dict_to_json(deal_result,'save.json')
  87. df_all = pd.DataFrame(columns=['会员姓名', '会员分类'])
  88. df_all.to_csv('会员列表.csv',mode='a', header=True, index=False)
  89. else:
  90. deal_result = load_dict_from_json('save.json')
  91. keys_to_remove = [f'chunk_{i+1}' for i in range(43)]
  92. # keys_to_remove.remove('chunk_17')
  93. deal_result = {k: v for k, v in deal_result.items() if k not in keys_to_remove}
  94. save_dict_to_json(deal_result,'save_new.json')
  95. print(deal_result.keys())
  96. for name, value in deal_result.items():
  97. print(name)
  98. message = [
  99. {'role':'system', 'content': '你是一个名字判断专家,你需要根据提供的列表中的每一个字典元素的会员姓名,判断其是否为华裔的名字,结果为 华裔 或者为 非华裔,并将结果填充到会员分类中, 整合之后返回与提供数据一样的格式给我'},
  100. {'role':'user', 'content':prompt.format(chunk=str(value))}
  101. ]
  102. if not proxy:
  103. response = client.chat.completions.create(model='gpt-4',messages=message)
  104. result_string = response.choices[0].message.content
  105. else:
  106. result_string = post_openai(message)
  107. result = extract_list_from_string(result_string)
  108. if result:
  109. df = pd.DataFrame(result)
  110. df.to_csv('会员列表.csv',mode='a', header=False, index=False)
  111. # df_all = pd.concat([df_all, df], ignore_index=True)
  112. print(df.info())
  113. print(f'{name}处理完成, 请验收!')
  114. else:
  115. print(f'{name}出现问题啦, 请自行调试')
  116. # 数据清洗模块
  117. df_origin = pd.read_excel('会员列表.xlsx')
  118. df_result = pd.read_csv('会员列表.csv')
  119. df_result['会员分类'] = df_result['会员分类'].apply(lambda x : str(x).replace('Non-Chinese', '非华裔').replace('non-Chinese', '非华裔').replace('Chinese','华裔').replace('可能是华裔','华裔')\
  120. .replace('可能华裔','华裔').replace('非华人','华裔').replace('华人','非华裔').replace('是','华裔').replace('否','非华裔'))
  121. df_origin_use = df_origin[['会员姓名']]
  122. df_result.to_excel('加分类的原始表.xlsx')
  123. print('*******************************')
  124. df_origin_use = df_origin_use.drop_duplicates(subset=['会员姓名'])
  125. print('*******************************')
  126. df_final = df_origin_use.merge(df_result, on='会员姓名', how='left')
  127. df_final = df_final.drop_duplicates(subset=['会员姓名'])
  128. df_final.to_excel('加分类并对原表对应.xlsx')