ai-model
/
dataTools


			
				
					
						
						
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131
							import os
import xlrd
import json

from collections import defaultdict

def parse_excel_files(directory,only_ask):
    result = []
    # 遍历指定目录下的所有文件和文件夹
    for root, dirs, files in os.walk(directory):
        # 获取文件夹名称
        folder_name = os.path.basename(os.path.normpath(root))
        for file in files:
            # 仅处理扩展名为 .xlsx 或 .xls 的文件
            if file.endswith('.xlsx') or file.endswith('.xls'):
                file_path = os.path.join(root, file)
                
                # 使用 xlrd 打开 Excel 文件
                workbook = xlrd.open_workbook(file_path)
                # 遍历每个 sheet
                for sheet in workbook.sheets():
                    rows = sheet.nrows
                    for row in range(rows):
                        row_data = []
                        # 跳过表头
                        if row == 0:
                            continue
                        # 跳过空行
                        if all(sheet.cell_value(row, col) == '' for col in range(sheet.ncols)):
                            continue
                        # 遍历每一列
                        for col in range(sheet.ncols):
                            # 获取表头
                            header = sheet.cell_value(0, col)
                            # 获取单元格值
                            cell_value = sheet.cell_value(row, col)
                            if cell_value:
                                if header=='问':
                                    if only_ask is None:
                                        cell_value= '问：'+str(cell_value)
                                    else:
                                        cell_value= str(cell_value).strip()
                                    row_data.append(cell_value)
                                if header=='答' and only_ask==None:
                                    cell_value= '答：'+str(cell_value)
                                    row_data.append(cell_value)
                        # 将有效行数据添加到结果中
                        if row_data:
                            result.append(row_data)
    print(len(result))
    return result

def write_data_to_file(input_directory,output_file,only_ask=None):
    
    # 解析 Excel 文件并写入数据到文件
    excel_data = parse_excel_files(input_directory,only_ask)
    with open(output_file, 'w', encoding='utf-8') as f:
        # 遍历每行内容
        for row in excel_data:
            # 使用json.dumps将行数据转换为JSON格式的字符串
            json_row = json.dumps(row, ensure_ascii=False)
            # 写入文件并添加换行符
            f.write(json_row + '\n')

def formatTjl(directory):
    result = []
   
    # 遍历指定目录下的所有文件和文件夹
    for root, dirs, files in os.walk(directory):
        for file in files:
            # 处理扩展名为 .xlsx 和 .xls 的文件
            if file.endswith('.xlsx') or file.endswith('.xls'):
                file_path = os.path.join(root, file)
                
                
                if file.endswith('.xls'):
                    # 使用 xlrd 打开 Excel 文件
                    workbook = xlrd.open_workbook(file_path)
                    for sheet in workbook.sheets():
                        print(sheet.name)
                        for row_idx in range(sheet.nrows):
                            row_data = {
                                'data': '',
                                'time': '',
                                'CLJYSSLL': '',
                                'SJTYJLY': '',
                            }
                            # 跳过表头
                            if row_idx < 4:
                                continue
                            # 跳过空行
                            if all(sheet.cell_value(row_idx, col_idx) == '' for col_idx in range(sheet.ncols)):
                                continue
                            # 遍历每一列
                            for col_idx in range(sheet.ncols):
                                # 获取表头
                                header = sheet.cell_value(3, col_idx)
                                
                                cell_value = sheet.cell_value(row_idx, col_idx)
                                if cell_value:
                                    if header == '日期':
                                        row_data['data'] = str(cell_value)
                                    elif header == '时间':
                                        row_data['time'] = str(cell_value)
                                    elif header == '除磷加药瞬时流量':
                                        row_data['CLJYSSLL'] = str(cell_value)
                                    elif header == '乙酸钠加药瞬时流量转换':
                                        row_data['SJTYJLY'] = str(cell_value)
                            # 将有效行数据添加到结果中
                            result.append(row_data)
                            
     # 根据 data 和 time 进行排序
    def sort_key(row):
        return (row['data'])
    
    result.sort(key=sort_key)
    # print(len(result))
    output_file = './excel/ceshi.json'
    with open(output_file, 'w', encoding='utf-8') as f:
        for row in result:
            json_row = json.dumps(row, ensure_ascii=False)
            f.write(json_row + '\n')
            
    
if __name__ == "__main__": 
    # 指定目录路径和输出文件路径
    input_directory = '/Users/yushanghui/hongshantianping/git/dataTools/book/edit/'
    output_file = '/Users/yushanghui/hongshantianping/git/dataTools/book/json/edit.json'

    write_data_to_file(input_directory,output_file)