util.py 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302
  1. import json
  2. import csv
  3. import yaml
  4. from datetime import datetime
  5. from config import db_list_2
  6. import pandas as pd
  7. import re
  8. from pyecharts.charts import Page, Grid, Bar, Line, Scatter, Pie
  9. from pyecharts import options as opts
  10. from pyecharts.globals import ThemeType
  11. import pandas as pd
  12. import numpy as np
  13. from autogen.code_utils import CODE_BLOCK_PATTERN
  14. # 将JSON字符串转换为Excel文件
  15. def json_to_excel(jsons, excel_name):
  16. data = json.loads(jsons)
  17. df = pd.DataFrame(data)
  18. with pd.ExcelWriter(excel_name, engine='xlsxwriter') as writer:
  19. df.to_excel(writer, sheet_name='Sheet1', index=False)
  20. # 将JSON字符串转换为DataFrame对象
  21. def json_to_dataframe(jsons):
  22. # 解析 JSON 字符串
  23. data = json.loads(jsons)
  24. # 将数据转换为 DataFrame
  25. df = pd.DataFrame(data)
  26. return df
  27. # 将JSON字符串转换为CSV文件
  28. def json_to_csv(jsons, csv_name):
  29. data = json.loads(jsons)
  30. fieldnames = list(data[0].keys())
  31. with open(csv_name, 'w', newline='') as file:
  32. writer = csv.DictWriter(file, fieldnames=fieldnames)
  33. # 写入列名
  34. writer.writeheader()
  35. # 写入数据
  36. writer.writerows(data)
  37. # 获取当前时间戳,并格式化为HH_MM_SS
  38. def get_timestamp():
  39. now = datetime.now()
  40. hours = now.hour
  41. minutes = now.minute
  42. seconds = now.second
  43. short_time_mm_ss = f"{hours:02}_{minutes:02}_{seconds:02}"
  44. return short_time_mm_ss
  45. # 根据数据库名称获取数据库参数
  46. def get_db_param(db_name):
  47. for db in db_list_2:
  48. print(db)
  49. if db['database'] == db_name:
  50. return db
  51. return None
  52. # 将内容写入指定文件
  53. def write_file(fname, content):
  54. with open(fname, "w") as f:
  55. f.write(content)
  56. # 将JSON字符串写入指定文件
  57. def write_json_file(fname, json_str: str):
  58. # convert ' to "
  59. json_str = json_str.replace("'", '"')
  60. # Convert the string to a Python object
  61. data = json.loads(json_str)
  62. # Write the Python object to the file as JSON
  63. with open(fname, "w") as f:
  64. json.dump(data, f, indent=4)
  65. # 将JSON字符串转化为YAML格式并写入文件
  66. def write_yml_file(fname, json_str: str):
  67. # Try to replace single quotes with double quotes for JSON
  68. cleaned_json_str = json_str.replace("'", '"')
  69. # Safely convert the JSON string to a Python object
  70. try:
  71. data = json.loads(cleaned_json_str)
  72. except json.JSONDecodeError as e:
  73. print(f"Error decoding JSON: {e}")
  74. return
  75. # Write the Python object to the file as YAML
  76. with open(fname, "w") as f:
  77. yaml.dump(data, f)
  78. # 从消息中提取代码块,返回代码块列表
  79. def extract_code_blocks(message):
  80. """(Experimental) Extract code blocks from a message. If no code blocks are found,
  81. return an empty list.
  82. Args:
  83. message (str): The message to extract code blocks from.
  84. Returns:
  85. List[CodeBlock]: The extracted code blocks or an empty list.
  86. """
  87. if message == None:
  88. return
  89. text = message
  90. match = re.findall(CODE_BLOCK_PATTERN, text, flags=re.DOTALL)
  91. if not match:
  92. return []
  93. code_blocks = []
  94. for lang, code in match:
  95. if lang == 'python' or lang == 'py':
  96. code_blocks.append(code)
  97. return code_blocks
  98. # 从消息中提取任务(正则查找#开头和结尾的内容),返回任务列表
  99. def get_task(text):
  100. if isinstance(text, dict):
  101. text = text.get('content')
  102. result = []
  103. pattern = r'#(.*?)#'
  104. matches = re.findall(pattern, text)
  105. for i, match in enumerate(matches, 1):
  106. print(f"{match}")
  107. result.append(match)
  108. return result
  109. # 自动绘制数据可视化报告
  110. class plot_data:
  111. def __init__(self):
  112. pass
  113. # 自动绘制图表并将其保存未HTML文件
  114. def auto_plot(self, df, filename):
  115. """
  116. Automatically generate plots for numeric and categorical columns in a DataFrame using pyecharts.
  117. Args:
  118. df (pandas.DataFrame): DataFrame containing data to plot.
  119. Returns:
  120. pyecharts.charts.Page: A Page object containing all generated charts.
  121. """
  122. time_index = pd.api.types.is_datetime64_any_dtype(df.index)
  123. numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
  124. category_cols = df.select_dtypes(include=['object']).columns.tolist()
  125. time_cols = df.select_dtypes(include=[np.datetime64]).columns.tolist()
  126. # 创建Page对象
  127. page = Page(layout=Page.SimplePageLayout)
  128. page.page_title = "Data Visualization Report"
  129. # 通用的初始化选项,设置宽度为50%
  130. common_init_opts = opts.InitOpts(
  131. theme=ThemeType.LIGHT,
  132. width= "1900px",
  133. height= "1200px",
  134. )
  135. # 通用的全局选项
  136. # common_global_opts = {
  137. # "toolbox_opts": opts.ToolboxOpts(),
  138. # "datazoom_opts": [opts.DataZoomOpts()],
  139. # }
  140. common_global_opts = {
  141. # "legend_opts": opts.LegendOpts(pos_top="10%"),
  142. "toolbox_opts": opts.ToolboxOpts(pos_top="5%"),
  143. # "datazoom_opts": [opts.DataZoomOpts()],
  144. }
  145. charts = []
  146. # Distribution of Numeric Columns
  147. if len(numeric_cols) > 0:
  148. bar = (
  149. Bar(init_opts=common_init_opts)
  150. .add_xaxis(numeric_cols)
  151. .add_yaxis("Mean", df[numeric_cols].mean().tolist(), label_opts=opts.LabelOpts(position="top"))
  152. .add_yaxis("Median", df[numeric_cols].median().tolist(), label_opts=opts.LabelOpts(position="top"))
  153. .set_global_opts(
  154. title_opts=opts.TitleOpts(title="Distribution of Numeric Columns", padding=[0, 0, 20, 0], pos_top="5%"),
  155. xaxis_opts=opts.AxisOpts(axislabel_opts=opts.LabelOpts(rotate=-45)),
  156. legend_opts=opts.LegendOpts(pos_top="10%"),
  157. **common_global_opts
  158. )
  159. )
  160. charts.append(bar)
  161. # Scatter Plots for Numeric Columns
  162. if len(numeric_cols) > 1:
  163. for i in range(len(numeric_cols)-1):
  164. for j in range(i+1, len(numeric_cols)):
  165. scatter = (
  166. Scatter(init_opts=common_init_opts)
  167. .add_xaxis(df[numeric_cols[i]].tolist())
  168. .add_yaxis(
  169. numeric_cols[j],
  170. df[[numeric_cols[i], numeric_cols[j]]].values.tolist(),
  171. label_opts=opts.LabelOpts(is_show=False),
  172. )
  173. .set_global_opts(
  174. title_opts=opts.TitleOpts(title=f"{numeric_cols[i]} vs {numeric_cols[j]} Scatter Plot", padding=[0, 0, 20, 0], pos_top="5%"),
  175. xaxis_opts=opts.AxisOpts(type_="value", name=numeric_cols[i]),
  176. yaxis_opts=opts.AxisOpts(type_="value", name=numeric_cols[j]),
  177. legend_opts=opts.LegendOpts(pos_top="10%"),
  178. **common_global_opts
  179. )
  180. )
  181. charts.append(scatter)
  182. # Bar Charts for Categorical vs Numeric Columns
  183. if len(category_cols) > 0 and len(numeric_cols) > 0:
  184. for cat_col in category_cols:
  185. for num_col in numeric_cols:
  186. bar = (
  187. Bar(init_opts=common_init_opts)
  188. .add_xaxis(df[cat_col].unique().tolist())
  189. .add_yaxis(num_col, df.groupby(cat_col)[num_col].mean().tolist(), label_opts=opts.LabelOpts(position="top"))
  190. .set_global_opts(
  191. title_opts=opts.TitleOpts(title=f"{num_col} Distribution by {cat_col}", padding=[0, 0, 20, 0], pos_top="5%"),
  192. xaxis_opts=opts.AxisOpts(axislabel_opts=opts.LabelOpts(rotate=-45)),
  193. legend_opts=opts.LegendOpts(pos_top="10%"),
  194. **common_global_opts
  195. )
  196. )
  197. charts.append(bar)
  198. # Time Series Plots
  199. if time_index or len(time_cols) > 0:
  200. if time_index:
  201. time_data = df.index
  202. elif len(time_cols) > 0:
  203. time_data = df[time_cols[0]]
  204. line = Line(init_opts=common_init_opts)
  205. line.add_xaxis(time_data.astype(str).tolist())
  206. for num_col in numeric_cols:
  207. line.add_yaxis(num_col, df[num_col].tolist(), is_smooth=True)
  208. line.set_global_opts(
  209. title_opts=opts.TitleOpts(title=f"{', '.join(numeric_cols)} Trends Over Time", padding=[0, 0, 20, 0], pos_top="5%"),
  210. xaxis_opts=opts.AxisOpts(type_="category", name="Time"),
  211. yaxis_opts=opts.AxisOpts(type_="value", name="Values"),
  212. legend_opts=opts.LegendOpts(pos_top="10%"),
  213. **common_global_opts
  214. )
  215. charts.append(line)
  216. # Pie Chart for Numeric Columns
  217. if len(numeric_cols) > 1:
  218. sum_values = df[numeric_cols].sum()
  219. pie = (
  220. Pie(init_opts=common_init_opts)
  221. .add(
  222. series_name="Sum Distribution",
  223. data_pair=list(zip(numeric_cols, sum_values.tolist())),
  224. radius=["40%", "75%"],
  225. )
  226. .set_global_opts(
  227. title_opts=opts.TitleOpts(title="Sum Distribution of Numeric Columns", padding=[0, 0, 20, 0], pos_top="5%"),
  228. legend_opts=opts.LegendOpts(pos_left="15%", pos_top="15%"),
  229. **common_global_opts
  230. )
  231. .set_series_opts(label_opts=opts.LabelOpts(formatter="{b}: {c}"))
  232. )
  233. charts.append(pie)
  234. for i in range(0, len(charts)):
  235. grid = Grid(init_opts=opts.InitOpts(theme=ThemeType.ROMA, width='700px',height='600px'))
  236. grid.add(charts[i], grid_opts=opts.GridOpts(pos_left="100px", pos_right="200px", pos_top="100px", pos_bottom='100px'))
  237. page.add(grid)
  238. page.render(filename)
  239. return f'data have saved in {filename}'
  240. if __name__ == '__main__':
  241. import pandas as pd
  242. import numpy as np
  243. # 创建一个示例 DataFrame
  244. data = {
  245. '类目': ['A', 'B', 'A', 'B', 'A', 'B'],
  246. '年龄': [10, 20, 30, 40, 50, 60],
  247. '体重': [5, 15, 25, 35, 45, 55],
  248. '日期': pd.date_range(start='2023-01-01', periods=6)
  249. }
  250. df = pd.DataFrame(data)
  251. df.set_index('日期', inplace=True) # 将日期列设置为索引
  252. # 创建一个 plot_data 实例
  253. plotter = plot_data()
  254. # 使用 auto_plot 方法绘制数据可视化报告
  255. result = plotter.auto_plot(df, 'data_visualization.html')
  256. print(result)