123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146 |
import pandas as pd
import math
import json
import os
import re
import ast

import openai
import requests

# SECURITY: this file previously hard-coded OpenAI API keys (including in
# commented-out lines). Any key that was committed must be revoked. The key
# is now read from the environment so the secret never lives in source.
client = openai.OpenAI(
    api_key=os.environ['OPENAI_API_KEY'],
    base_url='https://api.openai.com/v1',
)
# User-prompt template; {chunk} is filled with one chunk of member records
# (a list of dicts) before each request.
prompt = """提供的数据:{chunk}
返回的数据:"""
# proxy=True routes requests through post_openai() (HTTP proxy endpoint)
# instead of the official SDK client.
proxy = False
# next_iter=True resumes from the 'save.json' checkpoint instead of
# re-splitting the source Excel file (see the driver below).
next_iter = True
# Initial messages used only for the connectivity sanity check below; the
# system prompt instructs the model to classify each member name as
# Chinese-descent (华裔) or not (非华裔) and return the same data format.
message = [
    {'role':'system', 'content': '你是一个名字判断专家,你需要根据提供的列表中的每一个字典元素的会员姓名,判断其是否为华裔的名字,结果为 华裔 或者为 非华裔,并将结果填充到会员分类中, 整合之后返回与提供数据一样的格式给我'},
    {'role':'user', 'content':'你好,你是谁'}
]
# Connectivity sanity check: send a trivial message and print the reply so
# the operator can confirm the key/endpoint works before the long batch run.
# NOTE(review): this fires a live API call at import time.
if not proxy:
    response = client.chat.completions.create(model='gpt-4',messages=message)
    result_string = response.choices[0].message.content
    print(result_string)
def save_dict_to_json(dictionary, filename):
    """Serialize *dictionary* to *filename* as pretty-printed UTF-8 JSON.

    Non-ASCII characters (member names are Chinese) are written verbatim
    rather than \\u-escaped.
    """
    serialized = json.dumps(dictionary, ensure_ascii=False, indent=4)
    with open(filename, 'w', encoding='utf-8') as fp:
        fp.write(serialized)
def load_dict_from_json(filename):
    """Read *filename* as UTF-8 JSON and return the parsed object."""
    with open(filename, 'r', encoding='utf-8') as fp:
        parsed = json.load(fp)
    return parsed
def split_dataframe_to_dict(df, chunk_size=100):
    """Split *df* into consecutive row chunks of at most *chunk_size* rows.

    Returns a dict mapping 'chunk_1', 'chunk_2', ... (in order) to each
    chunk's rows rendered as a list of records (one dict per row). An empty
    DataFrame yields an empty dict.
    """
    chunks = {}
    # Step through the frame by chunk_size; iloc slicing clamps at the end,
    # so no explicit min() is needed for the final partial chunk.
    for idx, start in enumerate(range(0, len(df), chunk_size), start=1):
        part = df.iloc[start:start + chunk_size]
        chunks[f'chunk_{idx}'] = part.to_dict(orient='records')
    return chunks
def extract_list_from_string(input_string):
    """Extract and safely parse a Python-list literal embedded in a string.

    The model reply may wrap the list in prose. The original used only the
    non-greedy pattern r'\\[.*?\\]', which stops at the FIRST ']' — any nested
    list (e.g. '[[1, 2], [3]]') was truncated and failed to parse. We now try
    the greedy span (first '[' to last ']') first, falling back to the
    shortest span for inputs containing several separate lists.

    Returns the parsed list, or None when no list structure is found, parsing
    fails, or the parsed value is not a list. Diagnostics are printed, never
    raised, matching the caller's retry-by-hand workflow.
    """
    last_error = None
    for list_pattern in (r'\[.*\]', r'\[.*?\]'):
        match = re.search(list_pattern, input_string, re.DOTALL)
        if match is None:
            # No '[' ... ']' pair at all — the fallback cannot help either.
            print("未找到列表结构")
            return None
        try:
            # literal_eval only evaluates literals — safe on model output.
            result = ast.literal_eval(match.group())
        except Exception as e:
            last_error = e
            continue
        if isinstance(result, list):
            return result
        print("解析结果不是列表")
        return None
    print(f"解析错误: {last_error}")
    return None
def post_openai(messages):
    """POST *messages* to the proxy chat-completions endpoint; return the
    assistant reply text.

    The API key is read from the CHAT_PROXY_API_KEY environment variable —
    the original hard-coded the key here, leaking the secret in source
    control (revoke any key that was committed).

    Raises:
        KeyError: if CHAT_PROXY_API_KEY is unset, or the reply JSON lacks
            the expected choices/message/content structure.
        requests.HTTPError: on a non-2xx response (raise_for_status).
        requests.Timeout: if the proxy does not answer within the timeout.
    """
    import os  # local import keeps this fix self-contained

    base_url = "https://fast.bemore.lol"
    api_key = os.environ["CHAT_PROXY_API_KEY"]
    url = base_url + "/v1/chat/completions"
    headers = {
        'Accept': 'application/json',
        'Authorization': f'Bearer {api_key}',
        'User-Agent': 'Apifox/1.0.0 (https://apifox.com)',
        'Content-Type': 'application/json',
    }
    payload = json.dumps({
        "model": "gpt-4",
        "messages": messages,
    })
    # Explicit timeout so a stuck proxy can't hang the whole batch run;
    # raise_for_status surfaces HTTP errors instead of a confusing JSON
    # failure further down.
    response = requests.post(url, headers=headers, data=payload, timeout=120)
    print(response)
    response.raise_for_status()
    data = response.json()
    return data['choices'][0]['message']['content']
# Example usage / batch driver.
# First run (next_iter=False): split the member Excel list into chunks and
# checkpoint them to save.json. Resume run (next_iter=True): reload the
# checkpoint and skip already-processed chunks. Either way, classify each
# remaining chunk via the model and append results to 会员列表.csv.
if not next_iter:
    df = pd.read_excel('会员列表.xlsx')
    df['会员分类'] = ''
    deal_result = split_dataframe_to_dict(df)
    save_dict_to_json(deal_result,'save.json')
    # Write the CSV header exactly once; chunk results are appended below
    # with header=False.
    df_all = pd.DataFrame(columns=['会员姓名', '会员分类'])
    df_all.to_csv('会员列表.csv',mode='a', header=True, index=False)
else:
    # Resume: chunks 1..43 were completed in a previous run, so drop them.
    deal_result = load_dict_from_json('save.json')
    keys_to_remove = [f'chunk_{i+1}' for i in range(43)]
    # keys_to_remove.remove('chunk_17')
    deal_result = {k: v for k, v in deal_result.items() if k not in keys_to_remove}
    save_dict_to_json(deal_result,'save_new.json')
    print(deal_result.keys())
for name, value in deal_result.items():
    print(name)
    # Fresh message pair per chunk: fixed system instruction + the chunk's
    # records rendered into the prompt template.
    message = [
        {'role':'system', 'content': '你是一个名字判断专家,你需要根据提供的列表中的每一个字典元素的会员姓名,判断其是否为华裔的名字,结果为 华裔 或者为 非华裔,并将结果填充到会员分类中, 整合之后返回与提供数据一样的格式给我'},
        {'role':'user', 'content':prompt.format(chunk=str(value))}
    ]
    if not proxy:
        response = client.chat.completions.create(model='gpt-4',messages=message)
        result_string = response.choices[0].message.content
    else:
        result_string = post_openai(message)
    # Pull the list literal out of the model's free-form reply; None means
    # this chunk needs manual attention (printed below) but the loop goes on.
    result = extract_list_from_string(result_string)
    if result:
        df = pd.DataFrame(result)
        # Append without a header — the header row was written when the CSV
        # was first created in the next_iter=False branch.
        df.to_csv('会员列表.csv',mode='a', header=False, index=False)
        # df_all = pd.concat([df_all, df], ignore_index=True)
        print(df.info())
        print(f'{name}处理完成, 请验收!')
    else:
        print(f'{name}出现问题啦, 请自行调试')
# Data-cleaning stage: normalize the model's free-form labels to the two
# canonical values 华裔 / 非华裔, then join the classifications back onto the
# original member list (one row per unique name).
df_origin = pd.read_excel('会员列表.xlsx')
df_result = pd.read_csv('会员列表.csv')
# NOTE(review): this chained replace is order-sensitive and substring-based.
# '非华人' -> '华裔' and '华人' -> '非华裔' look inverted relative to the
# other mappings — confirm these are intentional against the actual model
# output. Also '是'/'否' match those characters anywhere in the cell, not
# only as whole labels.
df_result['会员分类'] = df_result['会员分类'].apply(lambda x : str(x).replace('Non-Chinese', '非华裔').replace('non-Chinese', '非华裔').replace('Chinese','华裔').replace('可能是华裔','华裔')\
    .replace('可能华裔','华裔').replace('非华人','华裔').replace('华人','非华裔').replace('是','华裔').replace('否','非华裔'))
df_origin_use = df_origin[['会员姓名']]
df_result.to_excel('加分类的原始表.xlsx')
print('*******************************')
# Deduplicate names before the merge so each member appears at most once.
df_origin_use = df_origin_use.drop_duplicates(subset=['会员姓名'])
print('*******************************')
# Left join keeps every original member even when a classification is missing.
df_final = df_origin_use.merge(df_result, on='会员姓名', how='left')
df_final = df_final.drop_duplicates(subset=['会员姓名'])
df_final.to_excel('加分类并对原表对应.xlsx')
|