Source code for pyaota.generator.wordexport2rawyaml

#!/usr/bin/env python3
"""
Convert a ZyBooks-exported zip into a YAML multiple-choice question bank.

Supported zip layouts
---------------------
1. **QTI XML** (preferred): an ``items/`` folder containing one QTI v2.1
   XML file per question.
2. **Word documents** (legacy fallback): three ``.docx`` files following the
   ZyBooks naming convention.

Usage:
    python zybooks_zip_to_yaml.py export.zip output.yaml

Requirements:
    pip install pyyaml                (always required)
    pip install python-docx           (Word path only)
"""

import sys
import re
import shutil
import zipfile
import tempfile
import xml.etree.ElementTree as ET
from pathlib import Path

import yaml


# ---------- Utility: detect files in extracted zip ----------

def find_docs_in_extracted_dir(extract_dir: Path):
    """Locate the questions and answer-key documents in an extracted ZyBooks zip.

    Only ``.docx`` files directly inside *extract_dir* are considered. A file
    whose lower-cased name contains ``answer_key`` is the answer key, a file
    containing ``with_answers`` is ignored, and any other file is taken to be
    the questions document. When several files fall into the same category
    the last one encountered wins.

    Returns
    -------
    tuple
        ``(questions_doc_path, answer_key_doc_path)``.

    Raises
    ------
    FileNotFoundError
        If there are no ``.docx`` files, or either document cannot be found.
    """
    candidates = list(extract_dir.glob("*.docx"))
    if not candidates:
        raise FileNotFoundError("No .docx files found in the extracted zip directory.")

    # Classify each file by the ZyBooks naming convention.
    found = {"questions": None, "answer_key": None}
    for path in candidates:
        lowered = path.name.lower()
        if "answer_key" in lowered:
            found["answer_key"] = path
        elif "with_answers" not in lowered:
            # Neither marker present: the base questions file (e.g. Test_Test.docx)
            found["questions"] = path

    if found["questions"] is None:
        raise FileNotFoundError("Could not find the questions .docx (without 'answer_key' or 'with_answers' in the name).")
    if found["answer_key"] is None:
        raise FileNotFoundError("Could not find the answer key .docx (name containing 'answer_key').")

    return found["questions"], found["answer_key"]
# ---------- Parsing answer key ----------
def parse_answer_key(answer_doc_path: Path):
    """Read an answer-key .docx and map question numbers to answer letters.

    Each paragraph is stripped and matched against the form ``N) x``, e.g.::

        1) b
        2) a

    Paragraphs that do not match are ignored. If a question number appears
    more than once, the last occurrence wins.

    Returns a dict mapping the integer question number to the lower-cased
    answer letter (``'a'``--``'d'``).
    """
    # python-docx is imported lazily so the QTI path works without it.
    from docx import Document

    entry_pattern = re.compile(r'^(\d+)\)\s*([a-dA-D])')
    document = Document(str(answer_doc_path))
    hits = (entry_pattern.match(par.text.strip()) for par in document.paragraphs)
    return {int(hit.group(1)): hit.group(2).lower() for hit in hits if hit}
# ---------- Parsing questions ----------
def paragraph_text(paragraph):
    """Extract text from a paragraph, inserting spaces between runs at font boundaries.

    Word splits text into "runs" at every formatting change.
    ``paragraph.text`` simply concatenates the run texts, which drops
    whitespace when a font change falls between two words (e.g. "Hello" in
    bold + "world" in normal becomes "Helloworld"). This helper inserts a
    single space between consecutive non-empty runs when neither side already
    has whitespace at the boundary.

    Empty runs are skipped, so an empty run sitting between two text runs
    does not suppress the separating space.

    Parameters
    ----------
    paragraph
        Object exposing ``runs`` (each with a ``text`` attribute) and ``text``.

    Returns
    -------
    str
        The joined run text, or ``paragraph.text`` when there are no runs.
    """
    runs = paragraph.runs
    if not runs:
        return paragraph.text  # fall back for edge cases (e.g. field codes)

    parts = []
    for run in runs:
        chunk = run.text
        if not chunk:
            # Skipping empty runs keeps parts[-1] pointing at the real
            # preceding text, so the boundary check below stays meaningful.
            continue
        if parts and not parts[-1][-1].isspace() and not chunk[0].isspace():
            parts.append(" ")
        parts.append(chunk)
    return "".join(parts)
def is_question_start(text: str) -> bool:
    """Return True if the stripped *text* opens with ``N)`` followed by whitespace."""
    stripped = text.strip()
    return bool(re.match(r'^\d+\)\s', stripped))
def is_choice_line(text: str) -> bool:
    """Return True if the stripped *text* opens with a choice letter (a-d), a dot and whitespace."""
    stripped = text.strip()
    return bool(re.match(r'^[a-dA-D]\.\s', stripped))
def parse_questions(questions_doc_path: Path, answers: dict):
    """
    Parse the questions .docx into a list of dicts matching the YAML schema::

        - id: Q1
          points: 1
          type: mcq
          stem: [ {type: text|code, text: "...", ...}, ... ]
          choices: [ {key: 'a', text: '...'}, ... ]
          correct: 'b'

    Parameters
    ----------
    questions_doc_path : Path
        Path to the questions ``.docx`` file.
    answers : dict
        Mapping of integer question number to answer letter; a question
        whose number is missing gets ``correct: None``.

    Returns
    -------
    list of dict
        One dict per question, in document order.
    """
    # python-docx is imported lazily so the QTI path works without it.
    from docx import Document

    doc = Document(str(questions_doc_path))
    paras = [paragraph_text(p) for p in doc.paragraphs]
    n = len(paras)
    i = 0
    questions = []

    # Skip title or other non-question paragraphs at top
    while i < n and not is_question_start(paras[i]):
        i += 1

    while i < n:
        line = paras[i].strip()
        if not is_question_start(line):
            i += 1
            continue

        # Parse "N) ..." line
        m = re.match(r'^(\d+)\)\s*(.*)$', line)
        if not m:
            i += 1
            continue

        qnum = int(m.group(1))
        first_rest = m.group(2).strip()

        stem_blocks = []
        choices = []

        # If the first line has text after "N)", treat as stem text
        if first_rest:
            stem_blocks.append({
                "type": "text",
                "text": first_rest,
            })
        i += 1

        # Collect stem paragraphs until we hit choices or next question
        while i < n:
            raw = paras[i]
            text = raw.strip()

            # Skip blank lines
            if not text:
                i += 1
                continue

            # Next question or first choice ends the stem
            if is_question_start(text) or is_choice_line(text):
                break

            # Otherwise, part of stem
            if "\n" in raw:
                # Treat as code block with literal newlines
                stem_blocks.append({
                    "type": "code",
                    "language": "python",  # assumption for ENGR 131
                    "style": "mypython",   # matches your LaTeX listings style
                    "text": raw,
                })
            else:
                stem_blocks.append({
                    "type": "text",
                    "text": text,
                })
            i += 1

        # Collect choices if present (choices may span multiple paragraphs)
        while i < n:
            text = paras[i].strip()
            if not text:
                i += 1
                continue
            if is_question_start(text):
                break

            m = re.match(r'^([a-dA-D])\.\s*(.*)$', text)
            if m:
                # Start a new choice
                choices.append({
                    "key": m.group(1).lower(),
                    "text": m.group(2),
                })
            elif choices:
                # Continuation line for the current choice
                choices[-1]["text"] += " " + text
            else:
                # Non-choice text before any choice was seen — stop
                break
            i += 1

        # Build question dict
        questions.append({
            "id": f"Q{qnum}",
            "points": 1,
            "type": "mcq",
            "stem": stem_blocks,
            "choices": choices,
            "correct": answers.get(qnum),
        })
        # The outer loop resumes at paras[i] and looks for the next question start

    return questions
# ---------- QTI v2.1 XML parsing ----------

_QTI_NS = "http://www.imsglobal.org/xsd/imsqti_v2p1"


def _qti(tag):
    """Return a namespace-qualified tag name for QTI v2.1."""
    return f"{{{_QTI_NS}}}{tag}"


def _collapse_blank_lines(text):
    """Collapse runs of multiple blank lines into a single blank line."""
    return re.sub(r'\n{3,}', '\n\n', text)


# Regex indicators per topic; _detect_topic counts matches of each pattern
# (case-insensitively) and picks the topic with the highest total.
_TOPIC_INDICATORS = {
    "strings": [
        # stem/choice keywords
        r'\bstring\b', r'\bsubstring\b', r'\bcharacter\b', r'\bconcat',
        r'\bslice\b', r'\bslicing\b', r'\bf-string\b', r'\bformat',
        r'\bformatting\b', r'\bpresentation\s+type',
        # string methods
        r'\.split\b', r'\.join\b', r'\.strip\b', r'\.rstrip\b', r'\.lstrip\b',
        r'\.upper\b', r'\.lower\b', r'\.capitalize\b', r'\.title\b',
        r'\.find\b', r'\.rfind\b', r'\.index\b', r'\.count\b',
        r'\.replace\b', r'\.startswith\b', r'\.endswith\b',
        r'\.isdigit\b', r'\.isalpha\b', r'\.isalnum\b',
        r'\.isupper\b', r'\.islower\b', r'\.isspace\b',
        r'\.format\b', r'\.encode\b', r'\.decode\b',
        # string operations
        r'\[\s*-?\d*\s*:\s*-?\d*\s*(:\s*-?\d*\s*)?\]',  # slicing [0:6], [-5:], [0:-1:2]
        # format specifiers (alignment/fill)
        r'\{[^}]*:[^}]*[<>^][^}]*\}',  # {x:>4}, {x:*<4}
        # f-string / format() calls
        r'\bf"[^"]*"', r'\bf\'[^\']*\'',
    ],
    "functions": [
        r'\bfunction',  # matches "function" and "functions"
        r'\bparameter\b', r'\bargument\b',
        r'\bdef\s+\w+', r'\breturn\b', r'\bscope\b',
        r'\blocal\b', r'\bglobal\b', r'\blambda\b',
        r'\brecursion\b', r'\brecursive\b',
        r'\bdocstring',
    ],
    "files": [
        r'\bfile\b', r'\bfilename\b', r'\bopen\s*\(',
        r'\.read\b', r'\.readline\b', r'\.readlines\b',
        r'\.write\b', r'\.writelines\b', r'\.close\b',
        r'\bwith\s+open\b',
        r"['\"][rwa]b?['\"]",  # file modes 'r', 'w', 'a', 'rb'
        r'\bimport\s+csv\b', r'\bimport\s+json\b',
        r'\bcsv\.\w+', r'\bjson\.\w+',
        r'\bflush\b', r'\bbuffer\b',
        r'\bbytes\s*\(', r'\bbytes\s+literal',  # bytes() calls
    ],
    "lists": [
        r'\blist\b', r'\blist\s*\(',
        r'\.append\b', r'\.extend\b', r'\.insert\b', r'\.pop\b',
        r'\.remove\b', r'\.sort\b', r'\.reverse\b',
        r'\bsorted\b', r'\blen\b',
        r'\bfor\b.*\bin\b',  # iteration
        r'\[.*\bfor\b.*\bin\b',  # list comprehension
        r'\ball\s*\(', r'\bany\s*\(',  # list aggregate builtins
        r'\bmin\s*\(', r'\bmax\s*\(',
        r'\w_list\b',  # variable names like new_list, my_list
    ],
    "dictionaries": [
        r'\bdict\b', r'\bdictionar',
        r'\.keys\b', r'\.values\b', r'\.items\b', r'\.get\b',
        r'\.update\b',
        r'\{\s*["\'\w]+\s*:\s*[^<>=*]',  # dict literal {key: val} (exclude format specs)
    ],
    "modules": [
        r'\bmodule\b', r'\bpackage\b', r'\blibrary\b',
        r'\bimport\s+\w+', r'\bfrom\s+\w+\s+import\b',
        r'\bsys\b', r'\bos\b', r'\bmath\b',
        r'\bsys\.argv\b', r'\b__name__\b',
        r'\bcommand.line\s+argument',
        r'\bpip\b',
        r'\brandom\.\w+',  # random.randrange, random.randint
        r'\bnp\.\w+', r'\bnumpy\b', r'\bpandas\b',  # numpy/pandas
        r'\bstruct\.\w+',  # struct.pack, struct.unpack
    ],
    "classes": [
        r'\bclass\s+\w+', r'\bself\.', r'\b__init__\b',
        r'\binherit', r'\bsubclass\b', r'\bobject\b',
        r'\binstance\b', r'\bconstructor\b', r'\battribute\b',
        r'\bmethod\b', r'\bsuper\s*\(',
        r'\b__\w+__\s*\(',  # dunder method calls __mul__(, __mod__(
        r'\bisinstance\s*\(',
        r'\boverload',
    ],
}


def _detect_topic(stem_blocks, choices):
    """Classify a question into a topic based on stem and choice content.

    The ``text`` of every stem block and choice is joined into one blob and
    scored against ``_TOPIC_INDICATORS``. Returns the best-scoring topic
    name, or ``"other"`` when no pattern matches at all.
    """
    # Combine all textual content for scanning
    parts = [b.get("text", "") for b in stem_blocks]
    parts.extend(c.get("text", "") for c in choices)
    blob = "\n".join(parts)

    scores = {
        topic: sum(len(re.findall(p, blob, re.IGNORECASE)) for p in patterns)
        for topic, patterns in _TOPIC_INDICATORS.items()
    }

    # Ties resolve to the topic listed first in _TOPIC_INDICATORS.
    best = max(scores, key=scores.get)
    return best if scores[best] > 0 else "other"


def _looks_like_code(text):
    """Heuristic: return True if *text* looks like a code expression.

    Checks for common Python syntax indicators such as format specifiers,
    method calls, brackets, assignment operators, etc.
    """
    indicators = [
        r'\{.*[:!].*\}',       # format specifiers {x:>4}, {x!r}
        r'\w+\.\w+\(',         # method calls foo.bar(
        r'\w+\(',              # function calls upper(), len(
        r'\w+\[',              # subscript arr[
        r'\w+\s*[=!<>]=',      # comparison/assign x == , x !=
        r'\breturn\b|\bprint\b|\bimport\b',   # unambiguous keywords
        r'\bTrue\b|\bFalse\b|\bNone\b',       # builtins
        # compound-statement keywords only when they look like Python
        # statements (line ends with a colon)
        r'(?m)^\s*(?:class|def|if|elif|else|for|while|with|try|except|finally)\b.*:\s*$',
    ]
    return any(re.search(p, text) for p in indicators)


def _wrap_inline_code(text):
    """Wrap string literals and Python identifiers in ``backtick`` markers.

    Detects double-quoted string literals and underscore-containing words
    (e.g. ``new_string``, ``my_list``) that are almost certainly Python
    identifiers. Skips regions already inside ``...`` backtick markers.

    Both token kinds are wrapped in a single regex pass, so an identifier
    inside a quoted literal (``"new_string"``) is wrapped once as part of
    the literal instead of receiving a second, nested pair of markers.
    """
    # Quoted literal is tried first; an identifier match can never contain a
    # quote, so quote pairing is unaffected by the alternation.
    token = re.compile(r'"[^"]+"|\b\w+_\w+\b')

    # Split on ``...`` regions, only transform the outside parts
    parts = re.split(r'(``.*?``)', text)
    for i, part in enumerate(parts):
        if not part.startswith('``'):
            parts[i] = token.sub(lambda m: f"``{m.group(0)}``", part)
    return "".join(parts)


def _table_to_text(table_elem):
    """Convert a QTI ``<table>`` element into aligned plain-text rows.

    Each ``<tr>`` becomes one line; its ``<td>`` cells are right-justified to
    the widest cell of their column and joined with a space. Short rows are
    padded with empty cells. Returns an empty string when the table has no
    rows. The result is meant for inclusion as a ``code`` stem block.
    """
    ns = _QTI_NS
    rows = []
    for tr in table_elem.iter(f"{{{ns}}}tr"):
        cells = []
        for td in tr.iter(f"{{{ns}}}td"):
            # A cell might contain nested <p> or bare text
            cells.append(_element_text(td).strip())
        rows.append(cells)

    if not rows:
        return ""

    # Pad all rows to the same width
    max_cols = max(len(r) for r in rows)
    for r in rows:
        r.extend([""] * (max_cols - len(r)))

    # Column widths
    col_widths = [max(len(r[c]) for r in rows) for c in range(max_cols)]

    lines = [
        " ".join(r[c].rjust(col_widths[c]) for c in range(max_cols))
        for r in rows
    ]
    return "\n".join(lines)


def _element_text(elem, *, inline_code=False):
    """Recursively extract text from an XML element, converting <br/> to newlines.

    Parameters
    ----------
    inline_code : bool
        If True, wrap ``<code>`` content with double-backtick markers
        (``...``) so the downstream LaTeX renderer can typeset them
        with ``\\inl{}``.
    """
    _BLOCK_TAGS = {"p", "div", "pre"}
    parts = []
    if elem.text:
        parts.append(elem.text)
    for child in elem:
        # Strip the "{namespace}" prefix, if any, to get the bare tag name
        local = child.tag.split("}")[-1] if "}" in child.tag else child.tag
        if local == "br":
            parts.append("\n")
        elif local == "code" and inline_code:
            code_text = _element_text(child)
            parts.append(f"``{code_text.strip()}``")
        else:
            # Insert a newline before block-level elements when content precedes them
            if local in _BLOCK_TAGS and parts and not parts[-1].endswith("\n"):
                parts.append("\n")
            parts.append(_element_text(child, inline_code=inline_code))
        # Text following the child's closing tag belongs to this element
        if child.tail:
            parts.append(child.tail)
    return "".join(parts)
def parse_qti_item(xml_path):
    """Parse a single QTI v2.1 assessmentItem XML file into a question dict.

    The returned dict has the keys ``id``, ``points``, ``type``, ``topic``,
    ``stem``, ``choices`` and ``correct``. Stem blocks are built from the
    ``<img>``, ``<table>`` and ``<p>`` children of the item body's top-level
    ``<div>`` elements; choices come from its ``choiceInteraction``.

    Returns None if the root element is not an ``assessmentItem`` (e.g. a
    manifest) or if the item has no ``itemBody`` (a warning is printed in
    the latter case).
    """
    def local_name(tag):
        # Drop the "{namespace}" prefix, if present
        return tag.split("}")[-1] if "}" in tag else tag

    root = ET.parse(xml_path).getroot()

    # Skip files that aren't assessmentItem elements
    if local_name(root.tag) != "assessmentItem":
        return None

    qid = root.get("identifier", Path(xml_path).stem)

    # --- correct answer ---
    value_elem = root.find(
        f"{_qti('responseDeclaration')}/{_qti('correctResponse')}/{_qti('value')}"
    )
    correct = None
    if value_elem is not None and value_elem.text:
        correct = value_elem.text.strip().lower()

    # --- item body ---
    item_body = root.find(_qti("itemBody"))
    if item_body is None:
        print(f" WARNING: {Path(xml_path).name} has no itemBody, skipping")
        return None

    # Stem: children of top-level <div> elements (not inside choiceInteraction)
    stem_blocks = []
    for div in item_body.findall(_qti("div")):
        for node in div:
            kind = local_name(node.tag)

            if kind == "img":
                # Image element — record as image stem block
                stem_blocks.append({
                    "type": "image",
                    "src": node.get("src", ""),
                    "alt": node.get("alt", ""),
                    "width": node.get("width", ""),
                })
            elif kind == "table":
                table_text = _table_to_text(node)
                if table_text:
                    stem_blocks.append({
                        "type": "code",
                        "language": "text",
                        "style": "mypython",
                        "text": table_text,
                    })
            elif kind == "p":
                code = node.find(_qti("code"))
                if code is None:
                    # Plain paragraph
                    plain = _wrap_inline_code(_element_text(node).strip())
                    if plain:
                        stem_blocks.append({"type": "text", "text": plain})
                    continue

                # Distinguish block code (<p> is *only* a code listing)
                # from inline code (<code> mixed with surrounding text).
                leading = node.text and node.text.strip()
                trailing = code.tail and code.tail.strip()
                if leading or trailing:
                    # Inline code — render as text with ``backtick`` markers
                    mixed = _wrap_inline_code(
                        _element_text(node, inline_code=True).strip()
                    )
                    if mixed:
                        stem_blocks.append({"type": "text", "text": mixed})
                else:
                    # Block code listing
                    stem_blocks.append({
                        "type": "code",
                        "language": "python",
                        "style": "mypython",
                        "text": _collapse_blank_lines(_element_text(code).strip()),
                    })
            # Any other element is ignored

    # Merge consecutive text blocks where the second is a sentence
    # continuation (ZyBooks splits <p> at inline-element boundaries).
    merged = []
    for block in stem_blocks:
        previous = merged[-1] if merged else None
        if (previous is not None
                and previous.get("type") == "text"
                and block.get("type") == "text"):
            fragment = block["text"]
            if fragment:
                if fragment[0] in ',.?!;:':
                    # Punctuation attaches directly to the previous text
                    previous["text"] += fragment
                    continue
                if fragment[0].islower():
                    previous["text"] += " " + fragment
                    continue
        merged.append(block)
    stem_blocks = merged

    # Determine if the stem implies choices are code/output
    stem_text_joined = " ".join(
        b["text"] for b in stem_blocks if b.get("type") == "text"
    ).lower()
    implies_code_match = re.search(
        r"what is (the )?(output|result|value)"
        r"|what does .* (print|output|return)"
        r"|what will .* (print|output|display)"
        r"|complete the (following )?code"
        r"|which .* (line|statement|expression|code|method)"
        r"|presentation type"
        r"|as output",
        stem_text_joined,
    )
    stem_implies_code = implies_code_match is not None

    # Choices
    choices = []
    interaction = item_body.find(_qti("choiceInteraction"))
    if interaction is not None:
        for simple_choice in interaction.findall(_qti("simpleChoice")):
            # Check if this individual choice contains <code>
            contains_code = simple_choice.find(f".//{_qti('code')}") is not None
            choice_text = _element_text(simple_choice).strip()
            if contains_code:
                choice_text = _collapse_blank_lines(choice_text)

            entry = {
                "key": simple_choice.get("identifier", "").lower(),
                "text": choice_text,
            }
            if contains_code or stem_implies_code or _looks_like_code(choice_text):
                entry["type"] = "code"
            choices.append(entry)

    return {
        "id": qid,
        "points": 1,
        "type": "mcq",
        "topic": _detect_topic(stem_blocks, choices),
        "stem": stem_blocks,
        "choices": choices,
        "correct": correct,
    }
def _numeric_sort_key(path):
    """Return the integer immediately preceding ``.xml`` in *path*'s file name.

    E.g. ``qti_item_VE_IP_42.xml`` -> 42. Names without such a number yield
    ``float('inf')`` so they sort last.
    """
    found = re.search(r'(\d+)\.xml$', path.name)
    if found is None:
        return float('inf')
    return int(found.group(1))
def parse_qti_items_dir(items_dir):
    """Parse every ``*.xml`` file in *items_dir* into a list of question dicts.

    Files are processed in numeric filename order. Files for which
    :func:`parse_qti_item` returns None are skipped. The remaining questions
    are renumbered with zero-padded sequential IDs (``Q1``.., ``Q01``..), and
    a summary line is printed.

    Raises
    ------
    FileNotFoundError
        If the directory contains no ``.xml`` files.
    """
    xml_files = sorted(items_dir.glob("*.xml"), key=_numeric_sort_key)
    if not xml_files:
        raise FileNotFoundError(f"No .xml files found in {items_dir}")

    parsed = (parse_qti_item(xml_file) for xml_file in xml_files)
    questions = [item for item in parsed if item is not None]

    # Assign zero-padded sequential IDs
    width = len(str(len(questions)))
    for number, item in enumerate(questions, start=1):
        item["id"] = f"Q{number:0{width}d}"

    skipped = len(xml_files) - len(questions)
    print(f"Parsed {len(questions)} QTI items from {items_dir} ({skipped} skipped)")
    return questions
# ---------- Main entry points ----------
def convert(zip_path, output_yaml_path):
    """Convert a ZyBooks-exported zip into a YAML question bank.

    Auto-detects the format: if the zip contains an ``items/`` directory with
    XML files, the QTI parser is used. Otherwise, falls back to the Word
    document parser.

    Images referenced by ``image`` stem blocks are copied from the zip's
    ``items/`` directory into an ``images/`` directory next to the output
    YAML, preserving any subdirectory contained in the image ``src``. A
    warning is printed for each referenced image that is missing.

    Parameters
    ----------
    zip_path : str or Path
        Path to the exported zip file.
    output_yaml_path : str or Path
        Path to the output YAML file.

    Raises
    ------
    FileNotFoundError
        If *zip_path* is not an existing file, or the expected documents
        cannot be found inside the zip.
    """
    zip_path = Path(zip_path)
    output_yaml_path = Path(output_yaml_path)

    if not zip_path.is_file():
        raise FileNotFoundError(f"{zip_path} does not exist or is not a file.")

    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir_path = Path(tmpdir)

        with zipfile.ZipFile(zip_path, 'r') as zf:
            zf.extractall(tmpdir_path)

        items_dir = tmpdir_path / "items"
        if items_dir.is_dir() and any(items_dir.glob("*.xml")):
            questions = parse_qti_items_dir(items_dir)
        else:
            questions_doc, answer_key_doc = find_docs_in_extracted_dir(tmpdir_path)
            print(f"Using questions doc: {questions_doc.name}")
            print(f"Using answer key doc: {answer_key_doc.name}")
            answers = parse_answer_key(answer_key_doc)
            questions = parse_questions(questions_doc, answers)

        # Collect image references and copy files alongside the YAML.
        # This must happen while the temporary extraction directory exists.
        images_dir = output_yaml_path.parent / "images"
        image_count = 0
        for q in questions:
            for block in q.get("stem", []):
                if block.get("type") != "image":
                    continue
                src = block["src"]
                src_path = items_dir / src if items_dir.is_dir() else None
                if src_path and src_path.is_file():
                    dest = images_dir / src
                    # src may include a subdirectory (e.g. "images/fig.png");
                    # create the full destination parent, not just images_dir.
                    dest.parent.mkdir(parents=True, exist_ok=True)
                    shutil.copy2(src_path, dest)
                    image_count += 1
                else:
                    print(f" WARNING: image {src} referenced but not found in zip")

    data = {"questions": questions}

    with output_yaml_path.open("w", encoding="utf-8") as f:
        yaml.safe_dump(
            data,
            f,
            sort_keys=False,
            allow_unicode=True,
            width=80,
        )

    print(f"Wrote {len(questions)} questions to {output_yaml_path}")
    if image_count:
        print(f"Copied {image_count} image(s) to {images_dir}")
def convert_subcommand(args):
    """Run :func:`convert` using ``args.input_zip`` and ``args.output_yaml``; return 0."""
    source_zip, target_yaml = args.input_zip, args.output_yaml
    convert(source_zip, target_yaml)
    return 0
def main():
    """Script entry point: ``<prog> export.zip output.yaml``.

    Prints a usage line and exits with status 1 unless exactly two
    command-line arguments are given; otherwise runs :func:`convert`.
    """
    argv = sys.argv
    if len(argv) == 3:
        convert(argv[1], argv[2])
        return
    print("Usage: python zybooks_zip_to_yaml.py export.zip output.yaml")
    sys.exit(1)


if __name__ == "__main__":
    main()