# formatData.py
  1. #查找当前文件所有json文件 输出到一个文件夹每个会话对象换行不在一个集合里面
  2. # import json
  3. # import os
  4. #
  5. # def convert_json_files(input_folder):
  6. # all_conversations = []
  7. # for root, dirs, files in os.walk(input_folder):
  8. # for file_name in files:
  9. # if file_name.endswith('.json'):
  10. # input_file = os.path.join(root, file_name)
  11. # with open(input_file, 'r', encoding='utf-8') as f_in:
  12. # for line in f_in:
  13. # if not line.strip():
  14. # continue
  15. # try:
  16. # conversations = json.loads(line)
  17. # conversation_obj = {"conversations": []}
  18. # for i in range(0, len(conversations), 2):
  19. # if i == 0:
  20. # conversation_obj["conversations"].append({"role": "system", "content": "你是污水处理厂技术专家,针对用户提出的问题,提供专业见解和建议并且清晰有条理,解决用户提出的问题"})
  21. # conversation_obj["conversations"].append({"role": "user", "content": conversations[i].lstrip("问:")})
  22. # conversation_obj["conversations"].append({"role": "assistant", "content": conversations[i+1].lstrip("答:")})
  23. # all_conversations.append(conversation_obj)
  24. # except json.JSONDecodeError:
  25. # print("Error: Invalid JSON format in file:", input_file)
  26. # continue
  27. # return all_conversations
  28. #
  29. # #指定输入文件夹
  30. # input_folder = '/Users/yushanghui/hongshantianping/ai训练/data/optimizationData'
  31. #
  32. # all_conversations = convert_json_files(input_folder)
  33. #
  34. # # # 指定输出文件
  35. # output_file = '/Users/yushanghui/hongshantianping/ai训练/data/lineJson/2024.4.09.json'
  36. #
  37. # # 将所有对话写入输出文件
  38. # with open(output_file, 'w', encoding='utf-8') as f_out:
  39. # for conversation_obj in all_conversations:
  40. # json.dump(conversation_obj, f_out, ensure_ascii=False)
  41. # f_out.write('\n')
  42. #这个可以查找文件夹里面所有的json文件并过滤
  43. # import json
  44. # import os
  45. #
  46. #
  47. # def convert_json_files(input_folder):
  48. # all_conversations = []
  49. # special_lines = []
  50. # id = 0
  51. # for root, dirs, files in os.walk(input_folder):
  52. # for file_name in files:
  53. # if file_name.endswith('.json'):
  54. # input_file = os.path.join(root, file_name)
  55. # with open(input_file, 'r', encoding='utf-8') as f_in:
  56. # for line in f_in:
  57. # if "如图所示:" in line:
  58. # special_lines.append(line)
  59. # continue
  60. # if not line.strip():
  61. # continue
  62. # try:
  63. # conversations = json.loads(line)
  64. # id += 1
  65. # conversation_obj = {"id":"identity_" + str(id),"conversations": []}
  66. #
  67. # for i in range(0, len(conversations), 2):
  68. # # if i == 0:
  69. # # conversation_obj["conversations"].append({"role": "system","content": ""})
  70. # conversation_obj["conversations"].append(
  71. # {"from": "user", "value": conversations[i].lstrip("问:")})
  72. # conversation_obj["conversations"].append(
  73. # {"from": "assistant", "value": conversations[i + 1].lstrip("答:")})
  74. # all_conversations.append(conversation_obj)
  75. # except json.JSONDecodeError:
  76. # print("Error: Invalid JSON format in file:", input_file)
  77. # continue
  78. #
  79. # # Write special lines to a separate JSON file
  80. # if special_lines:
  81. # special_file = '/Users/yushanghui/hongshantianping/ai训练/data/lineJson/special_file.json' # Define the path for the special file
  82. # with open(special_file, 'w', encoding='utf-8') as special_out:
  83. # for line in special_lines:
  84. # special_out.write(line)
  85. #
  86. # return all_conversations
  87. #
  88. #
  89. # # 指定输入文件夹
  90. # input_folder = '/Users/yushanghui/hongshantianping/ai训练/data/optimizationData'
  91. #
  92. # all_conversations = convert_json_files(input_folder)
  93. #
  94. # # 指定输出文件
  95. # output_file = '/Users/yushanghui/hongshantianping/ai训练/data/lineJson/hongshanData_water.json'
  96. #
  97. # # 将所有对话写入输出文件
  98. #
  99. # with open(output_file, 'w', encoding='utf-8') as f_out:
  100. # json.dump(all_conversations, f_out, ensure_ascii=False, indent=4)
  101. # 输出到一个集合里面 并且格式化
  102. # import json
  103. # import os
  104. # def convert_json_files(input_folder):
  105. # all_conversations = []
  106. # for file_name in os.listdir(input_folder):
  107. # if file_name.endswith('.json'):
  108. # input_file = os.path.join(input_folder, file_name)
  109. # with open(input_file, 'r', encoding='utf-8') as f_in:
  110. # for line in f_in:
  111. # try:
  112. # conversations = json.loads(line)
  113. # conversation_obj = {"conversations": []}
  114. # for i in range(0, len(conversations), 2):
  115. # if i == 0:
  116. # conversation_obj["conversations"].append({"role": "system", "content": "假设你是一个污水处理厂技术专家,你要回答用户询问的问题."})
  117. # conversation_obj["conversations"].append({"role": "user", "content": conversations[i].lstrip("问:")})
  118. # conversation_obj["conversations"].append({"role": "assistant", "content": conversations[i+1].lstrip("答:")})
  119. # all_conversations.append(conversation_obj)
  120. # except json.JSONDecodeError:
  121. # print("Error: Invalid JSON format in file:", input_file)
  122. # continue
  123. # return all_conversations
  124. # # 指定输入文件夹
  125. # input_folder = '/Users/yushanghui/hongshantianping/ai训练/data/jsondata/hebing'
  126. # all_conversations = convert_json_files(input_folder)
  127. # # 指定输出文件
  128. # output_file = '/Users/yushanghui/hongshantianping/ai训练/data/lineJson/output.json'
  129. # # 将所有对话写入输出文件并格式化
  130. # with open(output_file, 'w', encoding='utf-8') as f_out:
  131. # json.dump(all_conversations, f_out, ensure_ascii=False, indent=4)
  132. # 输出的内容是数组并且每个对象会添加,以后换行
  133. # import json
  134. # import os
  135. # def convert_json_files(input_folder):
  136. # all_conversations = []
  137. # for file_name in os.listdir(input_folder):
  138. # if file_name.endswith('.json'):
  139. # input_file = os.path.join(input_folder, file_name)
  140. # with open(input_file, 'r', encoding='utf-8') as f_in:
  141. # for line in f_in:
  142. # try:
  143. # conversations = json.loads(line)
  144. # conversation_obj = {"conversations": []}
  145. # for i in range(0, len(conversations), 2):
  146. # if i == 0:
  147. # conversation_obj["conversations"].append({"role": "system", "content": "假设你是一个污水处理厂技术专家,你要回答用户询问的问题."})
  148. # conversation_obj["conversations"].append({"role": "user", "content": conversations[i].lstrip("问:")})
  149. # conversation_obj["conversations"].append({"role": "assistant", "content": conversations[i+1].lstrip("答:")})
  150. # all_conversations.append(conversation_obj)
  151. # except json.JSONDecodeError:
  152. # print("Error: Invalid JSON format in file:", input_file)
  153. # continue
  154. # return all_conversations
  155. # # 指定输入文件夹
  156. # input_folder = '/Users/yushanghui/hongshantianping/ai训练/data/jsondata/hebing'
  157. # all_conversations = convert_json_files(input_folder)
  158. # # 指定输出文件
  159. # output_file = '/Users/yushanghui/hongshantianping/ai训练/data/lineJson/output.json'
  160. # # 将所有对话写入输出文件并格式化
  161. # with open(output_file, 'w', encoding='utf-8') as f_out:
  162. # f_out.write("[\n") # 开始列表
  163. # for i, conversation_obj in enumerate(all_conversations):
  164. # json.dump(conversation_obj, f_out, ensure_ascii=False)
  165. # # f_out.write('\n') # 每个对话对象后添加换行符
  166. # if i < len(all_conversations) - 1:
  167. # f_out.write(",\n") # 在除了最后一个对话对象之后添加逗号和换行符
  168. # f_out.write("]\n") # 结束列表
  169. # qwen训练样本
  170. import json
  171. import os
  172. import random
  173. import re
# 将图片标记替换为 Markdown 图片地址
  175. def replace_image_tags(text):
  176. # 匹配包含和不包含 '.' 的情况
  177. pattern = r'@([^@]+?)(\.(jpg|jpeg|png|gif)|jpg|jpeg|png|gif)@\$\s*'
  178. # twoPattern = r'@[A-Z0-9.]+@\$\s*'
  179. # 替换模板,根据是否匹配到 '.' 动态决定是否添加
  180. def replace_template(match):
  181. filename = match.group(1)
  182. extension = match.group(2)
  183. if not extension.startswith('.'):
  184. extension = '.' + extension
  185. return f'![{filename}](https://static.fuxicarbon.com/modelData/{filename}{extension})'
  186. # 使用 re.sub 进行替换
  187. new_text = re.sub(pattern, replace_template, text)
  188. # 使用 re.sub 进行替换
  189. # result = re.sub(twoPattern, '', new_text)
  190. return new_text
  191. def convert_image_format(text):
  192. # 第一个正则表达式
  193. pattern1 = r'@([^@]+?)(\.(jpg|jpeg|png|gif)|jpg|jpeg|png|gif)?@[\$\$]'
  194. # 第二个正则表达式
  195. pattern2 = r'@([^@]+?)(?:\.(jpg|jpeg|png|gif))?@[\$\$]'
  196. # 替换模板函数,根据是否匹配到扩展名动态决定替换内容
  197. def replace_template(match):
  198. filename = match.group(1)
  199. extension = match.group(3) if match.group(3) else match.group(2)
  200. if extension:
  201. # 如果没有以 '.' 开头的扩展名,前面加上 '.'
  202. if not extension.startswith('.'):
  203. extension = '.' + extension
  204. else:
  205. extension = ''
  206. return f'![{filename}](https://static.fuxicarbon.com/modelData/{filename}{extension})'
  207. # 使用第一个正则表达式进行替换
  208. new_text = re.sub(pattern1, replace_template, text)
  209. # 使用第二个正则表达式进行替换
  210. new_text = re.sub(pattern2, replace_template, new_text)
  211. return new_text
  212. def convert_json_files_sharegpt(input_folder):
  213. all_conversations = []
  214. special_lines = []
  215. for root, dirs, files in os.walk(input_folder):
  216. for file_name in files:
  217. if file_name.endswith('.json'):
  218. input_file = os.path.join(root, file_name)
  219. with open(input_file, 'r', encoding='utf-8') as f_in:
  220. for line in f_in:
  221. if "如图所示:" in line:
  222. special_lines.append(line)
  223. continue
  224. if not line.strip():
  225. continue
  226. try:
  227. conversations = json.loads(line)
  228. conversation_obj = {"conversations": []}
  229. for i in range(0, len(conversations), 2):
  230. # if i == 0:
  231. # conversation_obj["conversations"].append({"system": "你是信义污水厂助手。"})
  232. conversation_obj["conversations"].append(
  233. {"from": "human", "value": conversations[i].lstrip("问:")})
  234. conversation_obj["conversations"].append(
  235. {"from": "gpt", "value": conversations[i + 1].lstrip("答:")})
  236. all_conversations.append(conversation_obj)
  237. except json.JSONDecodeError:
  238. print("Error: Invalid JSON format in file:", input_file)
  239. continue
  240. # Write special lines to a separate JSON file
  241. if special_lines:
  242. special_file = '/Users/yushanghui/hongshantianping/ai训练/data/lineJson/special_file.json' # Define the path for the special file
  243. with open(special_file, 'w', encoding='utf-8') as special_out:
  244. for line in special_lines:
  245. special_out.write(line)
  246. return all_conversations
  247. def formatSharegpt ():
  248. # 指定输入文件夹
  249. input_folder = './book/2024.5.13/'
  250. all_conversations = convert_json_files_sharegpt(input_folder)
  251. # 指定输出文件
  252. output_file = './book/shareGpt/2024.5.13/shareGpt.json'
  253. # 确保输出目录存在
  254. output_dir = os.path.dirname(output_file)
  255. os.makedirs(output_dir, exist_ok=True)
  256. # 将所有对话写入输出文件 随机打乱数据
  257. # random.shuffle(all_conversations)
  258. # 将所有对话写入输出文件
  259. with open(output_file, 'w', encoding='utf-8') as f_out:
  260. json.dump(all_conversations, f_out, ensure_ascii=False, indent=4)
  261. # with open(output_file, 'w', encoding='utf-8') as f_out:
  262. # for conversation_obj in all_conversations:
  263. # json.dump(conversation_obj, f_out, ensure_ascii=False)
  264. # f_out.write('\n')
  265. def convert_json_files(input_folder):
  266. all_conversations = []
  267. # 获取文件夹名称
  268. folder_name = os.path.basename(os.path.normpath(input_folder))
  269. for root, dirs, files in os.walk(input_folder):
  270. for file_name in files:
  271. #todo 条件表达式扩展名是json并且文件名匹配 good、reg_lines、short才符合条件
  272. if file_name in ["good.json", "reg_lines.json", "short.json"]:
  273. input_file = os.path.join(root, file_name)
  274. with open(input_file, 'r', encoding='utf-8') as f_in:
  275. for line in f_in:
  276. if not line.strip():
  277. continue
  278. try:
  279. conversations = json.loads(line)
  280. conversation_obj = {
  281. "id":f"{folder_name}_{len(all_conversations)}",
  282. "instruction": "",
  283. "input": "",
  284. "output": "",
  285. "history": []
  286. }
  287. for i in range(0, len(conversations), 2):
  288. # if i == 0:
  289. # conversation_obj["conversations"].append({"role": "system","content": ""})
  290. question = conversations[i].lstrip("问:").strip().replace(" ", "").replace(" ", "")
  291. answer = conversations[i + 1].lstrip("答:").strip().replace(" ", "").replace(" ", "")
  292. # match = re.search(r'times:(.*)', question)
  293. # if match:
  294. # conversation_obj["time"] = match.group(1)
  295. # question = re.sub(r'times:.*', '', question)
  296. question = convert_image_format(question)
  297. answer = convert_image_format(answer)
  298. if len(conversations)==2:
  299. conversation_obj["instruction"]=question
  300. conversation_obj["output"]= answer
  301. elif len(conversations)>2:
  302. # todo 如果当前 i小于len(conversations)的最后两个执行下面的操作
  303. if i < len(conversations)-2:
  304. history = []
  305. history.append(question)
  306. history.append(answer)
  307. conversation_obj["history"].append(history)
  308. # todo 如果当前 i等于len(conversations)的最后两个元素执行下面的操作
  309. elif i == len(conversations) - 2: # 倒数第二个对话
  310. conversation_obj["instruction"]=question
  311. conversation_obj["output"]= answer
  312. all_conversations.append(conversation_obj)
  313. except json.JSONDecodeError:
  314. print("Error: Invalid JSON format in file:", input_file)
  315. continue
  316. return all_conversations
# 格式化训练数据为 Alpaca 格式
  318. def formatAlpaca (input_folder,output_file):
  319. all_conversations = convert_json_files(input_folder)
  320. print(len(all_conversations))
  321. # 将所有对话写入输出文件 随机打乱数据
  322. # random.shuffle(all_conversations)
  323. with open(output_file, 'w', encoding='utf-8') as f_out:
  324. json.dump(all_conversations, f_out, ensure_ascii=False, indent=4)
  325. # 转成不是集合 都是一行一行的json对象
  326. # with open(output_file, 'w', encoding='utf-8') as f_out:
  327. # for conversation_obj in all_conversations:
  328. # json.dump(conversation_obj, f_out, ensure_ascii=False)
  329. # f_out.write('\n')
  330. def formatFlatten(input_folder):
  331. all_conversations = []
  332. for root, dirs, files in os.walk(input_folder):
  333. for file_name in files:
  334. if file_name.endswith('.json'):
  335. input_file = os.path.join(root, file_name)
  336. with open(input_file, 'r', encoding='utf-8') as f_in:
  337. for line in f_in:
  338. if not line.strip():
  339. continue
  340. try:
  341. conversations = json.loads(line)
  342. for i in range(0, len(conversations), 2):
  343. if "信义污水厂" not in conversations[i]:
  344. conversations[i] = conversations[i].replace("问:", "问:信义污水厂的")
  345. all_conversations.append([conversations[i],conversations[i + 1]])
  346. except json.JSONDecodeError:
  347. print("Error: Invalid JSON format in file:", input_file)
  348. continue
  349. # Write special lines to a separate JSON file
  350. if all_conversations:
  351. special_file = './excel/huaxiang/huaxiang_2024.5.21.json' # Define the path for the special file
  352. with open(special_file, 'w', encoding='utf-8') as f_out:
  353. for conversation_obj in all_conversations:
  354. json.dump(conversation_obj, f_out, ensure_ascii=False)
  355. f_out.write('\n')
  356. if __name__ == "__main__":
  357. input_folder = './book/jiejin/'
  358. output_file = './book/alpace/jiejin.json'
  359. formatAlpaca(input_folder,output_file)