Python文件处理(1)-电子病例报告PDF信息提取

所用包

import os
import re
import pandas as pd
from PyPDF2 import PdfReader
from pathlib import Path
import openpyxl

STEP1

pdf信息转text文本

# STEP1: read one sample report and flatten it into a single string so the
# regular expressions in the following steps can be tried against it.
filepath = "青春末期随访全pdf/xxxx.pdf"
reader = PdfReader(filepath)
# Concatenate the text of every page in page order (no separator is
# inserted between pages).
text = "".join(page.extract_text() for page in reader.pages)

STEP2

测试并优化正则表达式

# STEP2: try candidate patterns against the sample `text` from STEP1 before
# copying them into the extraction function.  Each result is bound to the
# name of the field it actually targets so the trial output is unambiguous.
direct_bilirubin = re.search(r'直接胆红素.*?DBIL.*?\s*([\d\.]+)', text)
lymphocyte_count = re.search(r'淋巴细胞.*?\(LY\)\s*([\d\.]+)', text)
urine_blood = re.search(r'尿液分析[\s\S]*?隐血.*?([\d\+-]+)', text)

# Show what each pattern captured (None means the pattern did not match).
for trial_name, trial_match in (
    ("direct_bilirubin", direct_bilirubin),
    ("lymphocyte_count", lymphocyte_count),
    ("urine_blood", urine_blood),
):
    print(trial_name, trial_match.group(1) if trial_match else None)

STEP3

构建单病例报告提取函数

def _to_float(raw):
    """Convert a captured string to float; return None if it is not numeric.

    The numeric character classes used below (e.g. ``[\\d\\.]+``) can capture
    things like ``"."`` that ``float`` rejects; one such capture should not
    abort a whole batch run, so it is reported as a missing value instead.
    """
    try:
        return float(raw)
    except ValueError:
        return None


def _float_or_text(raw):
    """Convert to float when possible, otherwise keep the captured text.

    Used for fields whose pattern also accepts ``+``/``-`` symbols: a bare
    ``"-"`` cannot be turned into a float, so the symbol itself is kept
    rather than raising or discarding it.
    """
    try:
        return float(raw)
    except ValueError:
        return raw


# One entry per output column: (dict key, regex with exactly one capture
# group, converter applied to the captured text).  The order of this table is
# the order of the keys in the returned dict; the batch step depends on that
# order, so append rather than insert when adding fields.
_REPORT_FIELDS = (
    # Basic information
    ("姓名", r'姓\s+名: (\w+)', str),
    # Allow a trailing X/x after the digits so it is not cut off.
    ("身份证", r'证件号\s*(\d+[Xx]?)', str),
    ("检查日期", r'体检日期\s*([\d-]+)', str),
    ("手机号", r'电 话\s*(\d+)', str),
    # General examination (value and unit precede the label in the text)
    ("身高", r'([\d\.]+)\s*cm\s*身高', _to_float),
    ("体重", r'([\d\.]+)\s*Kg\s*体重', _to_float),
    ("腰围", r'([\d\.]+)\s*cm\s*腰围', _to_float),
    ("臀围", r'([\d\.]+)\s*cm\s*臀围', _to_float),
    # The negative lookahead keeps the first-measurement pattern from
    # matching the "第二次测量" row when that row comes first in the text.
    ("收缩压", r'([\d\.]+)\s*mmHg\s*收缩压(?!第二次测量)', _to_float),
    ("收缩压2", r'([\d\.]+)\s*mmHg\s*收缩压第二次测量', _to_float),
    ("舒张压", r'([\d\.]+)\s*mmHg\s*舒张压(?!第二次测量)', _to_float),
    ("舒张压2", r'([\d\.]+)\s*mmHg\s*舒张压第二次测量', _to_float),
    ("心率", r'([\d\.]+)\s*脉搏', _to_float),
    ("心率2", r'([\d\.]+)\s*心率第二次测量', _to_float),
    # Body composition
    # The parentheses are literal text, so they must be escaped; unescaped
    # they formed a capture group and group(1) was the label, not the value.
    # Both ASCII and full-width parentheses are accepted.
    ("体重(体成分仪)", r'体重[\(（]体成分仪[\)）]\s*([\d\.]+)\s*kg', _to_float),
    ("脂肪量", r'脂肪量\s*([\d\.]+)\s*kg', _to_float),
    ("体脂率", r'体脂率\s*([\d\.]+)\s*%', _to_float),
    ("去脂体重", r'去脂体重\s*([\d\.]+)\s*kg', _to_float),
    ("肌肉量", r'肌肉量\s*([\d\.]+)\s*kg', _to_float),
    ("体水分比", r'体水分比\s*([\d\.]+)\s*%', _to_float),
    ("骨量", r'骨量\s*([\d\.]+)\s*kg', _to_float),
    ("基础代谢", r'基础代谢\s*([\d\.]+)\s*kcal', _to_float),
    ("内脏脂肪等级", r'内脏脂肪等级\s*([\d]+)', int),
    ("BMI", r'(\d{2}\.?\d?)\s体重指数', _to_float),
    # Stool test
    ("大便颜色", r'大便颜色\s*(\w+)', str.strip),
    ("大便性状", r'大便性状\s*(\w+)', str.strip),
    ("大便镜检", r'大便镜检\s*([^\n]+)', str.strip),
    ("异常情况描述", r'异常情况描述\s*([^\n]+)', str.strip),
    # Urine test.  The gap before the capture is lazy (.*?): a greedy .*
    # swallows the rest of the line and leaves only its last digit/sign
    # character for the group.
    ("尿胆原", r'尿胆原URO.*?([\d\+-]+)', str),
    ("尿胆红素", r'胆红素BIL.*?([\d\+-]+)', str),
    ("酮体", r'酮体KET.*?([\d\+-]+)', str),
    ("隐血", r'尿液分析[\s\S]*?隐血.*?([\d\+-]+)', str),
    ("尿蛋白", r'尿蛋白.*?([\d\+-]+)', str),
    ("亚硝酸盐", r'亚硝酸盐.*?NIT.*?([\d\+-]+)', str),
    ("白细胞", r'尿液分析[\s\S]*?白细胞\s*([\d\+-]+)', str),
    ("尿比重", r'尿比重SG\s*([\d\.]+)', _to_float),
    ("酸碱度", r'酸碱度PH\s*([\d\.]+)', str),
    ("维生素c", r'维生素c.*?([\d\+-]+)', _float_or_text),
    ("尿糖", r'尿糖.*?([\d\+-]+)', str),
    # Complete blood count
    ("白细胞(10^9/L)", r'血常规白细胞计数\s*([\d\.]+)', _to_float),
    ("红细胞(10^12)", r'血常规红细胞计数\s*([\d\.]+)', _to_float),
    ("血红蛋白(g/L)", r'血红蛋白浓度\s*([\d\.]+)', _to_float),
    ("红细胞压积(%)", r'红细胞比容\s*([\d\.]+)', _to_float),
    ("红细胞平均体积(fL)", r'平均红细胞体积\s*([\d\.]+)', _to_float),
    ("平均血红蛋白量(pg)", r'平均RBC血红蛋白含量\s*([\d\.]+)', _to_float),
    ("平均血红蛋白浓度(g/L)", r'平均RBC血红蛋白浓度\s*([\d\.]+)', _to_float),
    ("红细胞分布宽度SD(fL)", r'红细胞体积分布宽度.*?SD\s*([\d\.]+)', _to_float),
    ("红细胞分布宽度CV(%)", r'红细胞体积分布宽度.*?CV\s*([\d\.]+)', _to_float),
    ("血小板(10^9/L)", r'血小板计数\s*([\d\.]+)', _to_float),
    ("血小板压积(%)", r'血小板比容\s*([\d\.]+)', _to_float),
    # NOTE(review): the previous version also searched for
    # r'血小板分布宽度\s*([\d\.]+)' but never put the result in the output.
    # It is still left out here so the set and order of returned keys is
    # unchanged for the batch step; confirm whether it should be exported.
    ("平均血小板体积", r'平均血小板体积\s*([\d\.]+)', _to_float),
    ("大血小板比例", r'大血小板比例\s*([\d\.]+)', _to_float),
    # Differential counts and percentages
    ("中性粒细胞计数(10^9/L)", r'中性粒细胞计数\s*([\d\.]+)', _to_float),
    ("淋巴细胞计数(10^9/L)", r'淋巴细胞.*?\(LY\)\s*([\d\.]+)', _to_float),
    ("单核细胞计数(10^9/L)", r'单核细胞计数\s*([\d\.]+)', _to_float),
    ("嗜酸性粒细胞计数(10^9/L)", r'嗜酸细胞计数\s*([\d\.]+)', _to_float),
    ("嗜碱性粒细胞计数(10^9/L)", r'嗜碱细胞计数\s*([\d\.]+)', _to_float),
    ("中性粒细胞百分比(%)", r'中性粒细胞百分比\s*([\d\.]+)', _to_float),
    ("淋巴细胞百分比(%)", r'淋巴细胞百分比\s*([\d\.]+)', _to_float),
    ("单核细胞百分比(%)", r'单核细胞百分比\s*([\d\.]+)', _to_float),
    ("嗜酸性粒细胞百分比(%)", r'嗜酸粒细胞百分比\s*([\d\.]+)', _to_float),
    ("嗜碱性粒细胞百分比(%)", r'嗜碱粒细胞百分比\s*([\d\.]+)', _to_float),
    ("未成熟粒细胞绝对值", r'未成熟粒细胞绝对值\s*([\d\.]+)', _to_float),
    ("未成熟粒细胞比值", r'未成熟粒细胞比值\s*([\d\.]+)', _to_float),
    # Glycated haemoglobin, bilirubin and proteins
    ("糖化血红蛋白(%)", r'糖化血红蛋白.*?\s*([\d\.]+)', _to_float),
    ("总胆红素(umol/L)", r'\(TBIL\)\s*([\d\.]+)', _to_float),
    ("直接胆红素(umol/L)", r'直接胆红素.*?DBIL.*?\s*([\d\.]+)', _to_float),
    ("间接胆红素(umol/L)", r'间接胆红素.*?IBIL.*?\s*([\d\.]+)', _to_float),
    ("总蛋白(g/L)", r'总蛋白\s*\(([\d\.]+) g/L\)', _to_float),
    ("白蛋白(g/L)", r'白蛋白\s*\(([\d\.]+) g/L\)', _to_float),
    ("球蛋白(g/L)", r'球蛋白\s*\(([\d\.]+) g/L\)', _to_float),
    ("白球比", r'白球比\s*([\d\.]+)', _to_float),
    # Other biochemistry
    ("谷丙转氨酶(umol/L)", r'谷丙转氨酶.*?\s*([\d\.]+)', _to_float),
    # NOTE(review): the pattern used to require the label "(Brea)", which
    # looks like a typo for "(Urea)".  Both spellings are accepted so that
    # whichever one the extracted text contains still matches - confirm
    # against a real report.
    ("尿素(mmol/L)", r'尿素\((?:Urea|Brea)\)\s*([\d\.]+)', _to_float),
    ("肌酐(umol/L)", r'肌酐\(Cr\)\s*([\d\.]+)', _to_float),
    ("尿酸(umol/L)", r'尿酸\(UA\)\s*([\d\.]+)', _to_float),
    ("二氧化碳(mmol/L)", r'二氧化碳\s*([\d\.]+)', _to_float),
    ("胱抑素C(mg/L)", r'胱抑素\s*([\d\.]+)', _to_float),
    ("B2微球蛋白(mg/L)", r'B2微球蛋白\s*([\d\.]+)', _to_float),
    # Lipid panel
    ("总胆固醇(mmol/L)", r'总胆固醇\s*([\d\.]+)', _to_float),
    ("甘油三酯(mmol/L)", r'甘油三酯\s*([\d\.]+)', _to_float),
    ("高密度脂蛋白(mmol/L)", r'高密度脂蛋白\s*([\d\.]+)', _to_float),
    ("低密度脂蛋白(mmol/L)", r'低密度脂蛋白\s*([\d\.]+)', _to_float),
    # Arteriosclerosis index and glucose
    ("动脉硬化指数", r'动脉硬化指数\s*([\d\.]+)', _to_float),
    ("葡萄糖(mmol/L)", r'葡萄糖\s*\(([\d\.]+) mmol/L\)', _to_float),
    # AST
    ("谷草转氨酶", r'谷草转氨酶.*?\s*([\d\.]+)', _to_float),
    # Ultrasound conclusion
    ("b超结果", r'2\.肝胆胰脾双肾彩色 B超.*?:(.+)\s', str),
)


def parse_report_text(text):
    """Extract every known field from the plain text of one report.

    Args:
        text: the full text of a report as a single string.

    Returns:
        A dict with one entry per row of ``_REPORT_FIELDS``, in table order.
        A field whose pattern does not match is ``None``; a numeric field
        whose captured text cannot be parsed as a number is also ``None``.
    """
    data = {}
    for key, pattern, convert in _REPORT_FIELDS:
        # re.search returns the first occurrence in the whole document.
        found = re.search(pattern, text)
        data[key] = convert(found.group(1)) if found else None
    return data


def extract_data_from_pdf(pdf_path):
    """Read one PDF report and return its extracted fields as a dict.

    Args:
        pdf_path: path of the PDF file to read.

    Returns:
        The dict produced by ``parse_report_text`` for the concatenated text
        of all pages.
    """
    reader = PdfReader(pdf_path)
    # Merge the text of all pages; fields are searched across page breaks.
    text = "".join(page.extract_text() for page in reader.pages)
    return parse_report_text(text)

STEP4

批量化提取导出

# Batch run: extract every PDF in one folder and export the results.
# The folder is the one that holds the STEP1 sample file.
directory_path = "青春末期随访全pdf"

data_list = []
# Sorted so the row order of the exported sheets is reproducible.
pdf_files = sorted(f for f in os.listdir(directory_path) if f.endswith('.pdf'))

for pdf_file in pdf_files:
    pdf_path = os.path.join(directory_path, pdf_file)
    extracted_data = extract_data_from_pdf(pdf_path)
    extracted_data["文件名"] = pdf_file  # keep the source file name with each row
    data_list.append(extracted_data)

# One row per report.
df = pd.DataFrame(data_list)
# Export with the original (Chinese) column names.
df.to_excel("二附院体检报告信息提取.xlsx", index=False)


# Rename the columns to the matching REDCap variable names.
# The rename is done by name rather than by position so that a missing comma
# or an added/removed field can no longer silently shift every later label
# (the previous positional list contained 'j19''j20', which Python joins
# into the single name 'j19j20').
# NOTE(review): with the comma restored the positional list had one more
# name than there are extracted columns.  'j47' is left unmapped here on the
# assumption that it belongs to 血小板分布宽度, which is not exported -
# confirm against the REDCap data dictionary.
redcap_columns = {
    "姓名": 'name',
    "身份证": 'idcard',
    "检查日期": 'j2',
    "手机号": 'j4',
    "身高": 'j5',
    "体重": 'j6',
    "腰围": 'j7',
    "臀围": 'j8',
    "收缩压": 'j9',
    "收缩压2": 'j9_1',
    "舒张压": 'j10',
    "舒张压2": 'j10_1',
    "心率": 'j11',
    "心率2": 'j11_1',
    "体重(体成分仪)": 'j12',
    "脂肪量": 'j13',
    "体脂率": 'j14',
    "去脂体重": 'j15',
    "肌肉量": 'j16',
    "体水分比": 'j17',
    "骨量": 'j18',
    "基础代谢": 'j19',
    "内脏脂肪等级": 'j20',
    "BMI": 'j20_1',
    "大便颜色": 'j21',
    "大便性状": 'j23',
    "大便镜检": 'j24',
    "异常情况描述": 'j25',
    "尿胆原": 'j25_1',
    "尿胆红素": 'j26_1',
    "酮体": 'j27_1',
    "隐血": 'j28_1',
    "尿蛋白": 'j29_1',
    "亚硝酸盐": 'j30_1',
    "白细胞": 'j31_1',
    "尿比重": 'j32_1',
    "酸碱度": 'j33',
    "维生素c": 'j34',
    "尿糖": 'j35_1',
    "白细胞(10^9/L)": 'j36_1',
    "红细胞(10^12)": 'j37',
    "血红蛋白(g/L)": 'j38',
    "红细胞压积(%)": 'j39',
    "红细胞平均体积(fL)": 'j40',
    "平均血红蛋白量(pg)": 'j41',
    "平均血红蛋白浓度(g/L)": 'j42',
    "红细胞分布宽度SD(fL)": 'j43',
    "红细胞分布宽度CV(%)": 'j44',
    "血小板(10^9/L)": 'j45',
    "血小板压积(%)": 'j46',
    "平均血小板体积": 'j48',
    "大血小板比例": 'j49',
    "中性粒细胞计数(10^9/L)": 'j50',
    "淋巴细胞计数(10^9/L)": 'j51',
    "单核细胞计数(10^9/L)": 'j52',
    "嗜酸性粒细胞计数(10^9/L)": 'j53',
    "嗜碱性粒细胞计数(10^9/L)": 'j54',
    "中性粒细胞百分比(%)": 'j55',
    "淋巴细胞百分比(%)": 'j56',
    "单核细胞百分比(%)": 'j57',
    "嗜酸性粒细胞百分比(%)": 'j58',
    "嗜碱性粒细胞百分比(%)": 'j59',
    "未成熟粒细胞绝对值": 'j60',
    "未成熟粒细胞比值": 'j62',
    "糖化血红蛋白(%)": 'j63',
    "总胆红素(umol/L)": 'j64',
    "直接胆红素(umol/L)": 'j65',
    "间接胆红素(umol/L)": 'j66',
    "总蛋白(g/L)": 'j67',
    "白蛋白(g/L)": 'j68',
    "球蛋白(g/L)": 'j69',
    "白球比": 'j70',
    "谷丙转氨酶(umol/L)": 'j71',
    "尿素(mmol/L)": 'j72',
    "肌酐(umol/L)": 'j73',
    "尿酸(umol/L)": 'j74',
    "二氧化碳(mmol/L)": 'j75',
    "胱抑素C(mg/L)": 'j87',
    "B2微球蛋白(mg/L)": 'j88',
    "总胆固醇(mmol/L)": 'j76',
    "甘油三酯(mmol/L)": 'j77',
    "高密度脂蛋白(mmol/L)": 'j78',
    "低密度脂蛋白(mmol/L)": 'j79',
    "动脉硬化指数": 'j80',
    "葡萄糖(mmol/L)": 'j81',
    "谷草转氨酶": 'j71_1',
    "b超结果": 'j86_1',
    "文件名": 'filename',
}
# `df` is rebound so that, as before, it carries the REDCap names afterwards.
df = df.rename(columns=redcap_columns)
# Export with the REDCap column names.
df.to_excel("二附院体检报告信息提取redcap.xlsx", index=False)

STEP5:查看生成的 Excel 文件,检查错误并优化代码

暂无评论

发送评论 编辑评论


				
|´・ω・)ノ
ヾ(≧∇≦*)ゝ
(☆ω☆)
(╯‵□′)╯︵┴─┴
 ̄﹃ ̄
(/ω\)
∠( ᐛ 」∠)_
(๑•̀ㅁ•́ฅ)
→_→
୧(๑•̀⌄•́๑)૭
٩(ˊᗜˋ*)و
(ノ°ο°)ノ
(´இ皿இ`)
⌇●﹏●⌇
(ฅ´ω`ฅ)
(╯°A°)╯︵○○○
φ( ̄∇ ̄o)
ヾ(´・ ・`。)ノ"
( ง ᵒ̌皿ᵒ̌)ง⁼³₌₃
(ó﹏ò。)
Σ(っ °Д °;)っ
( ,,´・ω・)ノ"(´っω・`。)
╮(╯▽╰)╭
o(*////▽////*)q
>﹏<
( ๑´•ω•) "(ㆆᴗㆆ)
😂
😀
😅
😊
🙂
🙃
😌
😍
😘
😜
😝
😏
😒
🙄
😳
😡
😔
😫
😱
😭
💩
👻
🙌
🖕
👍
👫
👬
👭
🌚
🌝
🙈
💊
😶
🙏
🍦
🍉
😣
Source: github.com/k4yt3x/flowerhd
颜文字
Emoji
小恐龙
花!
上一篇
下一篇