# ai-model / dataTools
							import os
import json
import docx

def extract_text_from_docx(docx_file):
    """Return the paragraph text of a Word document as a single string.

    The text of every paragraph is joined with newlines and the result is
    stripped of leading/trailing whitespace.

    Args:
        docx_file: Path (or whatever ``docx.Document`` accepts) of the
            document to read.

    Returns:
        The extracted text, or an empty string when the document could not
        be opened or read. Failures are reported on stdout, not raised.
    """
    try:
        paragraphs = docx.Document(docx_file).paragraphs
        joined = "\n".join(paragraph.text for paragraph in paragraphs)
    except Exception as err:
        # Best-effort extraction: one unreadable file must not abort a batch.
        print(f"Error extracting text from {docx_file}: {err}")
        return ""
    return joined.strip()

def extract_title_and_content(text):
    """Split document text into a title and the remaining content.

    Lines are separated on newline characters, stripped, and blank lines are
    discarded. The first remaining line is the title; all later lines are
    joined with single newlines to form the content.

    Args:
        text: The raw document text.

    Returns:
        A ``(title, content)`` tuple of strings. Both are empty when *text*
        contains no non-blank line; *content* is empty when only a title
        line is present.
    """
    stripped_lines = (line.strip() for line in text.split('\n'))
    non_blank = [line for line in stripped_lines if line]
    if not non_blank:
        return "", ""
    # Join once instead of growing a string with += inside the loop.
    title, *body = non_blank
    return title, "\n".join(body)

def parse_documents(folder_path):
    """Build question/answer records from the Word documents in a folder.

    Every directory entry whose name ends in ``.docx`` or ``.doc`` (compared
    case-insensitively) is read with ``extract_text_from_docx``. Documents
    that yield no text are skipped. For the others, the title returned by
    ``extract_title_and_content`` becomes the question and the remaining
    text becomes the answer; the file name without its extension is used as
    the question if no title was found.

    NOTE(review): ``docx`` appears to be python-docx, which is documented
    for ``.docx`` files only. Legacy ``.doc`` files are still attempted, as
    before, and are expected to end up as a reported extraction error.
    Confirm whether they should be converted to ``.docx`` first.

    Args:
        folder_path: Directory to scan (not recursive).

    Returns:
        A list of dicts with the keys "问" (question) and "答" (answer),
        ordered by file name.
    """
    parsed_data = []
    # os.listdir() returns entries in arbitrary order; sort so the generated
    # records come out in the same order on every run and platform.
    for file_name in sorted(os.listdir(folder_path)):
        # Word leaves "~$name.docx" lock files next to open documents; they
        # match the extension but are not documents, so skip them quietly.
        if file_name.startswith('~$'):
            continue
        if not file_name.lower().endswith(('.docx', '.doc')):
            continue

        text = extract_text_from_docx(os.path.join(folder_path, file_name))
        if not text:
            continue

        title, content = extract_title_and_content(text)
        parsed_data.append({
            "问": title or os.path.splitext(file_name)[0],
            "答": content,
        })

    return parsed_data

def save_to_json(data, json_file):
    """Write *data* to *json_file* as UTF-8 encoded, 4-space indented JSON.

    Non-ASCII characters are written literally rather than being escaped.
    An existing file at *json_file* is overwritten.

    Args:
        data: A JSON-serializable object.
        json_file: Path of the file to write.
    """
    with open(json_file, mode='w', encoding='utf-8') as out:
        json.dump(data, out, indent=4, ensure_ascii=False)

# Folder that holds the Word documents to convert.
folder_path = '/Users/yushanghui/Downloads/slibra-img/环保督察类/进水水质超标/'
# Destination of the generated JSON file (written into the same folder).
json_file = '/Users/yushanghui/Downloads/slibra-img/环保督察类/进水水质超标/parsed_documents.json'

# Parse every document in the folder into question/answer records.
parsed_data = parse_documents(folder_path)

# Persist the records to the JSON file.
save_to_json(parsed_data, json_file)

# Tell the user the run finished and where the output was written.
print("解析完成，并已保存到", json_file)