Python文件处理（1）-电子病例报告PDF信息提取

Python文件处理（1）-电子病例报告PDF信息提取

python,文件处理

|

2024-11-05 20:45

所用包

import os
import re
import pandas as pd
from PyPDF2 import PdfReader
from pathlib import Path
import openpyxl

STEP1

pdf信息转text文本

# Read a single report and concatenate the text of all its pages into `text`.
filepath = "青春末期随访全pdf/xxxx.pdf"
reader = PdfReader(filepath)
text = ""
for page in reader.pages:
    # Fall back to "" so a page that yields no text cannot break the
    # concatenation.
    text += page.extract_text() or ""

STEP2

测试并优化正则表达式

# Try individual patterns against the extracted `text` before adding them to
# the extractor. Each call yields a match object, or None when the pattern is
# not found; the captured value is group(1).
# The names mirror the ones used for the same patterns in
# extract_data_from_pdf.
direct_bilirubin = re.search(r'直接胆红素.*?DBIL.*?\s*([\d\.]+)', text)
lymphocyte_count = re.search(r'淋巴细胞.*?\(LY\)\s*([\d\.]+)', text)
urine_blood = re.search(r'尿液分析[\s\S]*?隐血.*?([\d\+-]+)', text)

STEP3

构建单病例报告提取函数

def _as_number(raw):
    """Convert captured text to float, keeping the raw text if it is not numeric."""
    try:
        return float(raw)
    except ValueError:
        # Some captures allow "+"/"-" (qualitative results) or stray dots;
        # keep the text so it stays visible in the export instead of crashing.
        return raw


# One entry per output column: (column name, pattern, converter).
# The order of this table is the column order of the result, and the batch
# script renames columns by position, so entries must not be reordered.
# Every pattern exposes the value as capture group 1.
_REPORT_FIELDS = (
    # Basic information
    ("姓名", r'姓\s+名: (\w+)', str),
    # Allow a trailing X/x so an ID number ending in the check character
    # is not truncated to its leading digits.
    ("身份证", r'证件号\s*(\d+[Xx]?)', str),
    ("检查日期", r'体检日期\s*([\d-]+)', str),
    ("手机号", r'电  话\s*(\d+)', str),
    # General examination (value and unit precede the label in the text)
    ("身高", r'([\d\.]+)\s*cm\s*身高', _as_number),
    ("体重", r'([\d\.]+)\s*Kg\s*体重', _as_number),
    ("腰围", r'([\d\.]+)\s*cm\s*腰围', _as_number),
    ("臀围", r'([\d\.]+)\s*cm\s*臀围', _as_number),
    # The lookahead keeps the first-measurement columns from matching the
    # "第二次测量" (second measurement) labels, which share the same prefix.
    ("收缩压", r'([\d\.]+)\s*mmHg\s*收缩压(?!第二次测量)', _as_number),
    ("收缩压2", r'([\d\.]+)\s*mmHg\s*收缩压第二次测量', _as_number),
    ("舒张压", r'([\d\.]+)\s*mmHg\s*舒张压(?!第二次测量)', _as_number),
    ("舒张压2", r'([\d\.]+)\s*mmHg\s*舒张压第二次测量', _as_number),
    ("心率", r'([\d\.]+)\s*脉搏', _as_number),
    ("心率2", r'([\d\.]+)\s*心率第二次测量', _as_number),
    # Body composition
    ("体重（体成分仪）", r'体重（体成分仪）\s*([\d\.]+)\s*kg', _as_number),
    ("脂肪量", r'脂肪量\s*([\d\.]+)\s*kg', _as_number),
    ("体脂率", r'体脂率\s*([\d\.]+)\s*%', _as_number),
    ("去脂体重", r'去脂体重\s*([\d\.]+)\s*kg', _as_number),
    ("肌肉量", r'肌肉量\s*([\d\.]+)\s*kg', _as_number),
    ("体水分比", r'体水分比\s*([\d\.]+)\s*%', _as_number),
    ("骨量", r'骨量\s*([\d\.]+)\s*kg', _as_number),
    ("基础代谢", r'基础代谢\s*([\d\.]+)\s*kcal', _as_number),
    ("内脏脂肪等级", r'内脏脂肪等级\s*([\d]+)', int),
    ("BMI", r'(\d{2}\.?\d?)\s体重指数', _as_number),
    # Stool test
    ("大便颜色", r'大便颜色\s*(\w+)', str.strip),
    ("大便性状", r'大便性状\s*(\w+)', str.strip),
    ("大便镜检", r'大便镜检\s*([^\n]+)', str.strip),
    ("异常情况描述", r'异常情况描述\s*([^\n]+)', str.strip),
    # Urine test. The gap before the value is matched lazily (.*?): a greedy
    # .* would swallow everything but the last character, turning a result
    # such as "++" into "+".
    ("尿胆原", r'尿胆原URO.*?([\d\+-]+)', str),
    ("尿胆红素", r'胆红素BIL.*?([\d\+-]+)', str),
    ("酮体", r'酮体KET.*?([\d\+-]+)', str),
    ("隐血", r'尿液分析[\s\S]*?隐血.*?([\d\+-]+)', str),
    ("尿蛋白", r'尿蛋白.*?([\d\+-]+)', str),
    ("亚硝酸盐", r'亚硝酸盐.*?NIT.*?([\d\+-]+)', str),
    ("白细胞", r'尿液分析[\s\S]*?白细胞\s*([\d\+-]+)', str),
    ("尿比重", r'尿比重SG\s*([\d\.]+)', _as_number),
    ("酸碱度", r'酸碱度PH\s*([\d\.]+)', str),
    ("维生素c", r'维生素c.*?([\d\+-]+)', _as_number),
    ("尿糖", r'尿糖.*?([\d\+-]+)', str),
    # Complete blood count.
    # The platelet distribution width (血小板分布宽度) is deliberately not
    # exported, so the number of columns stays at 87.
    ("白细胞(10^9/L)", r'血常规白细胞计数\s*([\d\.]+)', _as_number),
    ("红细胞(10^12)", r'血常规红细胞计数\s*([\d\.]+)', _as_number),
    ("血红蛋白(g/L)", r'血红蛋白浓度\s*([\d\.]+)', _as_number),
    ("红细胞压积(%)", r'红细胞比容\s*([\d\.]+)', _as_number),
    ("红细胞平均体积(fL)", r'平均红细胞体积\s*([\d\.]+)', _as_number),
    ("平均血红蛋白量(pg)", r'平均RBC血红蛋白含量\s*([\d\.]+)', _as_number),
    ("平均血红蛋白浓度(g/L)", r'平均RBC血红蛋白浓度\s*([\d\.]+)', _as_number),
    ("红细胞分布宽度SD(fL)", r'红细胞体积分布宽度.*?SD\s*([\d\.]+)', _as_number),
    ("红细胞分布宽度CV(%)", r'红细胞体积分布宽度.*?CV\s*([\d\.]+)', _as_number),
    ("血小板(10^9/L)", r'血小板计数\s*([\d\.]+)', _as_number),
    ("血小板压积(%)", r'血小板比容\s*([\d\.]+)', _as_number),
    ("平均血小板体积", r'平均血小板体积\s*([\d\.]+)', _as_number),
    ("大血小板比例", r'大血小板比例\s*([\d\.]+)', _as_number),
    # Differential counts and percentages
    ("中性粒细胞计数(10^9/L)", r'中性粒细胞计数\s*([\d\.]+)', _as_number),
    ("淋巴细胞计数(10^9/L)", r'淋巴细胞.*?\(LY\)\s*([\d\.]+)', _as_number),
    ("单核细胞计数(10^9/L)", r'单核细胞计数\s*([\d\.]+)', _as_number),
    ("嗜酸性粒细胞计数(10^9/L)", r'嗜酸细胞计数\s*([\d\.]+)', _as_number),
    ("嗜碱性粒细胞计数(10^9/L)", r'嗜碱细胞计数\s*([\d\.]+)', _as_number),
    ("中性粒细胞百分比(%)", r'中性粒细胞百分比\s*([\d\.]+)', _as_number),
    ("淋巴细胞百分比(%)", r'淋巴细胞百分比\s*([\d\.]+)', _as_number),
    ("单核细胞百分比(%)", r'单核细胞百分比\s*([\d\.]+)', _as_number),
    ("嗜酸性粒细胞百分比(%)", r'嗜酸粒细胞百分比\s*([\d\.]+)', _as_number),
    ("嗜碱性粒细胞百分比(%)", r'嗜碱粒细胞百分比\s*([\d\.]+)', _as_number),
    ("未成熟粒细胞绝对值", r'未成熟粒细胞绝对值\s*([\d\.]+)', _as_number),
    ("未成熟粒细胞比值", r'未成熟粒细胞比值\s*([\d\.]+)', _as_number),
    # Glycated haemoglobin, bilirubin and proteins
    ("糖化血红蛋白(%)", r'糖化血红蛋白.*?\s*([\d\.]+)', _as_number),
    ("总胆红素(umol/L)", r'\(TBIL\)\s*([\d\.]+)', _as_number),
    ("直接胆红素(umol/L)", r'直接胆红素.*?DBIL.*?\s*([\d\.]+)', _as_number),
    ("间接胆红素(umol/L)", r'间接胆红素.*?IBIL.*?\s*([\d\.]+)', _as_number),
    ("总蛋白(g/L)", r'总蛋白\s*\(([\d\.]+) g/L\)', _as_number),
    ("白蛋白(g/L)", r'白蛋白\s*\(([\d\.]+) g/L\)', _as_number),
    ("球蛋白(g/L)", r'球蛋白\s*\(([\d\.]+) g/L\)', _as_number),
    ("白球比", r'白球比\s*([\d\.]+)', _as_number),
    # Other biochemistry
    ("谷丙转氨酶(umol/L)", r'谷丙转氨酶.*?\s*([\d\.]+)', _as_number),
    # NOTE(review): "Brea" looks like a typo for "Urea"; both spellings are
    # accepted here — confirm which one the report text actually contains.
    ("尿素(mmol/L)", r'尿素\((?:Urea|Brea)\)\s*([\d\.]+)', _as_number),
    ("肌酐(umol/L)", r'肌酐\(Cr\)\s*([\d\.]+)', _as_number),
    ("尿酸(umol/L)", r'尿酸\(UA\)\s*([\d\.]+)', _as_number),
    ("二氧化碳(mmol/L)", r'二氧化碳\s*([\d\.]+)', _as_number),
    ("胱抑素C(mg/L)", r'胱抑素\s*([\d\.]+)', _as_number),
    ("B2微球蛋白(mg/L)", r'B2微球蛋白\s*([\d\.]+)', _as_number),
    # Lipid panel
    ("总胆固醇(mmol/L)", r'总胆固醇\s*([\d\.]+)', _as_number),
    ("甘油三酯(mmol/L)", r'甘油三酯\s*([\d\.]+)', _as_number),
    ("高密度脂蛋白(mmol/L)", r'高密度脂蛋白\s*([\d\.]+)', _as_number),
    ("低密度脂蛋白(mmol/L)", r'低密度脂蛋白\s*([\d\.]+)', _as_number),
    # Arteriosclerosis index, glucose, AST
    ("动脉硬化指数", r'动脉硬化指数\s*([\d\.]+)', _as_number),
    ("葡萄糖(mmol/L)", r'葡萄糖\s*\(([\d\.]+) mmol/L\)', _as_number),
    ("谷草转氨酶", r'谷草转氨酶.*?\s*([\d\.]+)', _as_number),
    # Ultrasound conclusion
    ("b超结果", r'2\.肝胆胰脾双肾彩色 B超.*?:(.+)\s', str),
)

# Compile once at import time so batch runs do not re-resolve each pattern
# for every report.
_COMPILED_REPORT_FIELDS = tuple(
    (column, re.compile(pattern), convert)
    for column, pattern, convert in _REPORT_FIELDS
)


def extract_data_from_pdf(pdf_path, text=None):
    """Extract the report fields of one PDF into a flat dictionary.

    Args:
        pdf_path: Path of the PDF report. It is opened with ``PdfReader``
            and the text of all pages is concatenated. Ignored when
            ``text`` is supplied.
        text: Optional report text that has already been extracted. When
            given, the PDF is not read, which allows patterns to be checked
            against saved text.

    Returns:
        A dict with one entry per field in ``_REPORT_FIELDS``, in table
        order. A value is ``None`` when its pattern is not found. Numeric
        fields are ``float`` (``int`` for 内脏脂肪等级); if a captured
        numeric value cannot be parsed, the captured text is returned
        instead. All other fields are strings.
    """
    if text is None:
        reader = PdfReader(pdf_path)
        # "or ''" guards against a page for which no text is returned.
        text = "".join(page.extract_text() or "" for page in reader.pages)

    data = {}
    for column, pattern, convert in _COMPILED_REPORT_FIELDS:
        match = pattern.search(text)
        data[column] = convert(match.group(1)) if match else None
    return data

STEP4

批量化提取导出

# Batch extraction: run the single-report extractor over every PDF found in
# directory_path and collect one row per file.
# NOTE(review): directory_path is not defined in this snippet; it has to be
# set to the folder holding the report PDFs before this part runs.
# The listing is sorted so the row order of the export is reproducible
# (os.listdir returns entries in arbitrary order), and the suffix check is
# case-insensitive so files named "*.PDF" are not skipped.
data_list = []
pdf_files = sorted(
    f for f in os.listdir(directory_path) if f.lower().endswith('.pdf')
)

for pdf_file in pdf_files:
    pdf_path = os.path.join(directory_path, pdf_file)
    extracted_data = extract_data_from_pdf(pdf_path)
    extracted_data["文件名"] = pdf_file  # keep the source file name with each row
    data_list.append(extracted_data)

# Build the table and export it with the original (Chinese) column names.
df = pd.DataFrame(data_list)
df.to_excel("二附院体检报告信息提取.xlsx", index=False)

# Rename the columns to the matching REDCap field names. The assignment is
# positional, so this list must have exactly one name per DataFrame column,
# in column order.
# NOTE(review): 'j19''j20' has no comma between the two literals, so Python
# joins them into the single name 'j19j20'. Only with that merged name does
# the list hold 88 entries, matching the 88 columns (87 extracted fields plus
# 文件名); adding the comma would make the assignment fail with a length
# mismatch. Confirm the intended REDCap names for the body-composition
# columns before changing it.
df.columns = [
    'name', 'idcard', 'j2', 'j4',
    'j5', 'j6', 'j7', 'j8', 'j9', 'j9_1', 'j10', 'j10_1', 'j11', 'j11_1',
    'j12', 'j13', 'j14', 'j15', 'j16', 'j17', 'j18', 'j19''j20', 'j20_1', 'j21',
    'j23', 'j24', 'j25', 'j25_1',
    'j26_1', 'j27_1', 'j28_1', 'j29_1', 'j30_1', 'j31_1', 'j32_1',
    'j33', 'j34', 'j35_1', 'j36_1',
    'j37', 'j38', 'j39', 'j40', 'j41', 'j42', 'j43', 'j44', 'j45', 'j46',
    'j47', 'j48', 'j49',
    'j50', 'j51', 'j52', 'j53', 'j54', 'j55', 'j56', 'j57', 'j58', 'j59',
    'j60', 'j62',
    'j63', 'j64', 'j65', 'j66', 'j67', 'j68', 'j69', 'j70',
    'j71', 'j72', 'j73', 'j74', 'j75', 'j87', 'j88',
    'j76', 'j77', 'j78', 'j79',
    'j80', 'j81', 'j71_1', 'j86_1',
    'filename',
]
# Export the renamed table as a second workbook.
df.to_excel("二附院体检报告信息提取redcap.xlsx", index=False)

STEP5：查看生成的excel文件检查错误并优化代码

暂无评论

发送评论编辑评论

Markdown

|´・ω・)ノ

ヾ(≧∇≦*)ゝ

(☆ω☆)

（╯‵□′）╯︵┴─┴

￣﹃￣

(/ω＼)

∠( ᐛ 」∠)＿

(๑•̀ㅁ•́ฅ)

→_→

୧(๑•̀⌄•́๑)૭

٩(ˊᗜˋ*)و

(ノ°ο°)ノ

(´இ皿இ｀)

⌇●﹏●⌇

(ฅ´ω`ฅ)

(╯°A°)╯︵○○○

φ(￣∇￣o)

ヾ(´･･｀｡)ノ"

( ง ᵒ̌皿ᵒ̌)ง⁼³₌₃

(ó﹏ò｡)

Σ(っ °Д °;)っ

( ,,´･ω･)ﾉ"(´っω･｀｡)

╮(╯▽╰)╭

o(*////▽////*)q

＞﹏＜

( ๑´•ω•) "(ㆆᴗㆆ)

颜文字

Emoji

小恐龙

花!