#!/usr/bin/env python3
"""
Read materials.csv, update bio_name to bio/<Candidate>.txt, parse Aggregate Mini Wiki.docx for numbered entries,
match entries to candidate names from materials.csv (case/underscore differences allowed),
extract Educational Background, Career, and Personal information sections, and save per-candidate files to bio/.

Run from repository root where materials.csv and Aggregate Mini Wiki.docx live.
"""

import csv
import os
import re
from docx import Document

MATERIALS = "materials.csv"
DOCX = "Aggregate Mini Wiki.docx"
BIO_DIR = "bio"

SECTION_HEADINGS = [
    "Educational Background",
    "Education",
    "Career",
    "Professional Experience",
    "Personal information",
    "Personal Information",
    "Personal Info",
]
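# NOTE: extract_sections_from_body() below matches the three canonical
# headings directly; this list records the variants the wiki may use.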

def normalize_name(s: str) -> str:
    # Lowercase, replace non-alphanumeric with underscore, collapse underscores
    s = s or ""
    s = s.strip()
    s = s.replace('\u2019', "")  # drop curly apostrophes, if any
    s = re.sub(r"[.\-]", "_", s)  # dots and hyphens become underscores
    s = re.sub(r"\s+", "_", s)  # whitespace becomes underscores
    s = re.sub(r"[^0-9a-zA-Z_]+", "", s)
    s = re.sub(r"_+", "_", s)
    return s.strip("_").lower()


def extract_candidate_name_from_path(path: str) -> str | None:
    # path like image/Adam_Frisch.jpeg -> Adam_Frisch
    if not path:
        return None
    base = os.path.basename(path)
    # remove extension
    name, _ext = os.path.splitext(base)
    return name


def convert_name_to_display(name: str) -> str:
    """Convert underscore name like Adam_Frisch to display name like Adam Frisch"""
    return name.replace('_', ' ')


def read_materials(path=MATERIALS):
    rows = []
    with open(path, newline="", encoding="utf-8") as f:
        reader = csv.DictReader(f)
        headers = reader.fieldnames
        for r in reader:
            rows.append(dict(r))
    return headers, rows


def write_materials(headers, rows, path=MATERIALS):
    with open(path, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=headers)
        writer.writeheader()
        for r in rows:
            writer.writerow(r)


def remove_hyperlinked_text(para):
    """Return the paragraph text with hyperlinked runs removed.

    python-docx only lists runs that are direct children of the paragraph, so
    runs nested inside w:hyperlink elements are normally excluded from
    para.runs already; the ancestor walk below is a defensive check in case a
    run is reached through a hyperlink wrapper anyway.
    """
    cleaned_text = []
    for run in para.runs:
        # Walk up the run's XML ancestors; drop the run if any is a hyperlink
        has_hyperlink = False
        parent = run.element.getparent()
        while parent is not None:
            if parent.tag.endswith('hyperlink'):
                has_hyperlink = True
                break
            parent = parent.getparent()

        if not has_hyperlink:
            cleaned_text.append(run.text)

    return ''.join(cleaned_text).strip()
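
# Underlying OOXML, simplified:
#   <w:p><w:r>kept</w:r><w:hyperlink><w:r>dropped</w:r></w:hyperlink></w:p>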


def parse_docx_by_candidate_names(docx_path, candidate_names):
    """
    Search for each candidate name in the document and extract their bio.
    Names appear in underscore format like "Adam_Frisch".
    Returns a dict mapping normalized candidate name to (original_name, bio_text).
    """
    doc = Document(docx_path)
    
    # Get all text with paragraph indices, removing hyperlinked text
    all_paragraphs = []
    for idx, para in enumerate(doc.paragraphs):
        text = remove_hyperlinked_text(para)
        all_paragraphs.append((idx, text))
    
    mapping = {}
    
    for candidate in candidate_names:
        # Candidate names usually appear in the document in the same
        # underscore format, so try an exact match first
        norm_candidate = normalize_name(candidate)
        
        # Find paragraph that exactly matches this candidate's name
        found_idx = None
        found_text = None
        
        for idx, text in all_paragraphs:
            # Check for exact match with candidate name
            if text == candidate:
                found_idx = idx
                found_text = text
                break
            # Also try with spaces instead of underscores
            elif text == candidate.replace('_', ' '):
                found_idx = idx
                found_text = text
                break
        
        if found_idx is None:
            # Try case-insensitive match
            for idx, text in all_paragraphs:
                if text.lower() == candidate.lower():
                    found_idx = idx
                    found_text = text
                    break
        
        if found_idx is None:
            continue
        
        # Extract bio content after this name
        # Bio consists of sections: Educational Background, Career, Personal Information
        # Stop when we hit another candidate name (a short line that's just a name)
        bio_lines = []
        
        # Known section headings that are NOT candidate names
        section_headings = ['Educational Background', 'Career', 'Personal Information']
        
        for idx in range(found_idx + 1, len(all_paragraphs)):
            text = all_paragraphs[idx][1]
            
            if not text:
                bio_lines.append('')
                continue
            
            # Don't stop at section headings
            if text in section_headings:
                bio_lines.append(text)
                continue
            
            # Check if this looks like another candidate name
            # Candidate names have underscores (like Adam_Frisch) or are short capitalized names
            if len(text) < 50:
                # If it has underscores, likely a candidate name
                if '_' in text:
                    words = text.replace('_', ' ').split()
                    if 2 <= len(words) <= 5:
                        # This is likely another candidate, stop here
                        break
                # If it's just 2-3 capitalized words without underscores, might also be a candidate
                elif 2 <= len(text.split()) <= 4:
                    words = text.split()
                    capitalized = sum(1 for w in words if w and w[0].isupper())
                    # All words capitalized and not a known section = likely a candidate
                    if capitalized == len(words):
                        # This is likely another candidate, stop here
                        break
            
            bio_lines.append(text)
        
        bio_text = '\n'.join(bio_lines).strip()
        if bio_text:
            mapping[norm_candidate] = (found_text, bio_text)
    
    return mapping
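
# Expected wiki layout (a sketch inferred from the matching rules above):
#   Adam_Frisch                 <- name line (underscores or spaces)
#   Educational Background
#   ...
#   Career
#   ...
#   Personal Information
#   ...
#   Next_Candidate              <- the next name line ends the entry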


def clean_sentence_ending(line: str) -> str:
    """Remove trailing commas, empty parentheses, and (/) from a line"""
    line = line.rstrip()
    # Remove trailing comma
    if line.endswith(','):
        line = line[:-1].rstrip()
    # Remove trailing (/)
    if line.endswith('(/)'):
        line = line[:-3].rstrip()
        # Also remove comma that might be before (/)
        if line.endswith(','):
            line = line[:-1].rstrip()
    # Remove trailing empty parentheses
    while line.endswith('()'):
        line = line[:-2].rstrip()
        # Also remove comma that might be before the empty parentheses
        if line.endswith(','):
            line = line[:-1].rstrip()
    return line
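
# Examples: clean_sentence_ending("Harvard University,") -> "Harvard University"
#           clean_sentence_ending("Mayor of Aspen, (/)") -> "Mayor of Aspen"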


def add_bullet_points(text: str, section_titles: list) -> str:
    """Add bullet points to each non-empty line that isn't a section title"""
    lines = text.split('\n')
    result = []
    for line in lines:
        stripped = line.strip()
        if not stripped:
            # Empty line - keep as is
            result.append('')
        elif stripped in section_titles:
            # Section title - keep as is
            result.append(stripped)
        else:
            # Content line - add bullet point
            result.append('• ' + stripped)
    return '\n'.join(result)
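
# Example: add_bullet_points("Career\nMayor of Aspen", ["Career"])
#          -> "Career\n• Mayor of Aspen"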


def extract_sections_from_body(body_text: str) -> dict:
    """
    Extract the three sections: Educational Background, Career, and Personal information.
    Section headings do NOT have colons after them.
    """
    result = {"Educational Background":"", "Career":"", "Personal information":"", "full":""}
    result['full'] = body_text.strip()
    if not body_text:
        return result

    lines = body_text.splitlines()
    
    # Find section headings (exact match, case-insensitive, no colon)
    section_indices = []  # list of (line_idx, section_name)
    
    for idx, line in enumerate(lines):
        line_stripped = line.strip()
        if not line_stripped:
            continue
        
        # Check for exact matches to section headings
        if re.match(r'^Educational\s+Background$', line_stripped, re.IGNORECASE):
            section_indices.append((idx, 'Educational Background'))
        elif re.match(r'^Career$', line_stripped, re.IGNORECASE):
            section_indices.append((idx, 'Career'))
        elif re.match(r'^Personal\s+Information$', line_stripped, re.IGNORECASE):
            section_indices.append((idx, 'Personal information'))
    
    if section_indices:
        # Extract content between section headings
        section_indices.sort()
        for i, (line_idx, section_name) in enumerate(section_indices):
            start = line_idx + 1
            # End is the start of the next section or end of document
            if i + 1 < len(section_indices):
                end = section_indices[i + 1][0]
            else:
                end = len(lines)
            
            section_lines = lines[start:end]
            section_text = '\n'.join(section_lines).strip()
            result[section_name] = section_text
    else:
        # No headings matched; fall back to storing the full text under
        # Educational Background so it still reaches the output file
        result['Educational Background'] = body_text
    
    return result
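
# Example:
#   extract_sections_from_body("Career\nMayor\n\nPersonal information\nBorn 1967")
#   -> {'Career': 'Mayor', 'Personal information': 'Born 1967',
#       'Educational Background': '', 'full': <the input, stripped>}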


def main():
    os.makedirs(BIO_DIR, exist_ok=True)

    headers, rows = read_materials()

    # 1) Extract candidate names from bio_name column (format: bio/Adam_Frisch.txt)
    candidate_names = []
    for r in rows:
        bio_name = r.get('bio_name', '') or ''
        # Extract candidate name from bio/*.txt format
        if bio_name.startswith('bio/') and bio_name.lower().endswith('.txt'):
            # Extract name from bio/Candidate_Name.txt -> Candidate_Name
            name = os.path.splitext(os.path.basename(bio_name))[0]
            candidate_names.append(name)
    
    print(f"Extracted {len(candidate_names)} candidate names from {MATERIALS}")

    # 2) Parse docx by searching for candidate names
    unique_candidates = sorted(set(candidate_names))
    # Drop placeholder entries ('*', 'fill_img') that are not candidate names
    unique_candidates = [c for c in unique_candidates if c not in ['*', 'fill_img']]
    
    mapping = parse_docx_by_candidate_names(DOCX, unique_candidates)
    print(f"Found {len(mapping)} candidates in {DOCX}")

    # 3) For each candidate, find match and extract sections
    saved = 0
    unmatched = []
    for candidate in unique_candidates:
        norm_candidate = normalize_name(candidate)
        
        # Look up in mapping
        if norm_candidate not in mapping:
            unmatched.append(candidate)
            continue

        orig_name, body = mapping[norm_candidate]
        sections = extract_sections_from_body(body)
        
        # Titles add_bullet_points() must leave un-bulleted (output files use
        # 'Personal Information', capitalized)
        section_titles = ['Educational Background', 'Career', 'Personal Information']
        
        # Create file contents with clear headings
        parts = []
        eb = sections.get('Educational Background', '').strip()
        car = sections.get('Career', '').strip()
        pi = sections.get('Personal information', '').strip()

        # Clean sentence endings in each section
        if eb:
            eb_lines = [clean_sentence_ending(line) for line in eb.split('\n')]
            eb = '\n'.join(eb_lines)
            parts.append('Educational Background\n' + eb)
        if car:
            car_lines = [clean_sentence_ending(line) for line in car.split('\n')]
            car = '\n'.join(car_lines)
            parts.append('Career\n' + car)
        if pi:
            pi_lines = [clean_sentence_ending(line) for line in pi.split('\n')]
            pi = '\n'.join(pi_lines)
            parts.append('Personal Information\n' + pi)
        if not parts:
            # fallback to full body
            body_lines = [clean_sentence_ending(line) for line in body.strip().split('\n')]
            parts = ['\n'.join(body_lines)]

        # Join sections and add bullet points
        out_text = '\n\n'.join(parts).strip()
        out_text = add_bullet_points(out_text, section_titles)
        
        # Add candidate name at the top (convert underscores to spaces)
        display_name = convert_name_to_display(candidate)
        out_text = display_name + '\n\n' + out_text
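        # Resulting file shape (illustrative):
        #   Adam Frisch
        #
        #   Educational Background
        #   • ...
        #   Career
        #   • ...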
        
        out_path = os.path.join(BIO_DIR, f"{candidate}.txt")
        with open(out_path, 'w', encoding='utf-8') as f:
            f.write(out_text)
        saved += 1
        print(f"Saved bio for {candidate} -> {out_path}")

    print(f"\nSaved {saved} bios. {len(unmatched)} unmatched candidates.")
    if unmatched:
        print("Unmatched candidates (no entry found in docx):")
        for u in unmatched[:50]:
            print(" - ", u)
        if len(unmatched) > 50:
            print(f"... and {len(unmatched) - 50} more")

if __name__ == '__main__':
    main()
