#!/usr/bin/env python3
"""
Convert a ZyBooks-exported zip into a YAML multiple-choice question bank.
Supported zip layouts
---------------------
1. **QTI XML** (preferred): an ``items/`` folder containing one QTI v2.1
XML file per question.
2. **Word documents** (legacy fallback): three ``.docx`` files following the
ZyBooks naming convention.
Usage:
python zybooks_zip_to_yaml.py export.zip output.yaml
Requirements:
    pip install pyyaml        (always required)
    pip install python-docx   (only needed for the legacy Word path)
"""
import sys
import re
import shutil
import zipfile
import tempfile
import xml.etree.ElementTree as ET
from pathlib import Path
import yaml
# ---------- Utility: detect files in extracted zip ----------
# ---------- Parsing answer key ----------
[docs]
def parse_answer_key(answer_doc_path: Path):
    """
    Read an answer-key .docx and collect the answer letter of each question.

    Every paragraph is stripped and matched against ``N) x``: a question
    number, a closing parenthesis, optional whitespace and a letter
    ``a``-``d`` in either case, for example::

        1) b
        2) a

    Paragraphs that do not match are ignored.

    Returns a dict mapping the integer question number to the lower-cased
    answer letter.  If a number occurs more than once the last one wins.
    """
    # Imported lazily so python-docx is only needed on the Word path.
    from docx import Document

    answer_line = re.compile(r'^(\d+)\)\s*([a-dA-D])')
    stripped_lines = (
        para.text.strip()
        for para in Document(str(answer_doc_path)).paragraphs
    )
    hits = (answer_line.match(entry) for entry in stripped_lines)
    return {int(hit.group(1)): hit.group(2).lower() for hit in hits if hit}
# ---------- Parsing questions ----------
[docs]
def paragraph_text(paragraph):
    """Extract text from a paragraph, inserting spaces between runs at font boundaries.

    Word splits text into "runs" at every formatting change.  ``paragraph.text``
    simply concatenates the run texts, which drops whitespace when a font
    change falls between two words (e.g. "Hello" in bold + "world" in normal
    becomes "Helloworld").  This helper inserts a single space between
    consecutive non-empty runs when neither side already has whitespace at
    the boundary.

    Runs whose text is empty are skipped, so an empty run sitting between two
    words does not suppress the separating space.

    Parameters
    ----------
    paragraph
        Object exposing ``runs`` (a sequence of objects with a ``text``
        attribute) and ``text``.

    Returns
    -------
    str
        The reconstructed paragraph text, or ``paragraph.text`` unchanged
        when the paragraph has no runs.
    """
    runs = paragraph.runs
    if not runs:
        return paragraph.text  # fall back for edge cases (e.g. field codes)

    parts = []
    for run in runs:
        chunk = run.text
        if not chunk:
            # An empty run has no boundary characters to inspect; keeping it
            # in ``parts`` would hide the previous run's last character.
            continue
        # ``parts[-1]`` is always a non-empty chunk here.
        if parts and not parts[-1][-1].isspace() and not chunk[0].isspace():
            parts.append(" ")
        parts.append(chunk)
    return "".join(parts)
[docs]
def is_question_start(text: str) -> bool:
    """Return True if *text* begins a numbered question (``N)``).

    Leading and trailing whitespace is ignored.  The number and closing
    parenthesis must be followed either by whitespace or by the end of the
    line, so both ``"3) What is ..."`` and a bare ``"3)"`` (question text
    continues in the following paragraphs) count, while ``"3)x"`` does not.
    """
    # ``(?:\s|$)``: after strip() a bare "N)" has no trailing whitespace left,
    # so the end-of-string alternative is needed to recognise it.
    return re.match(r'^\d+\)(?:\s|$)', text.strip()) is not None
[docs]
def is_choice_line(text: str) -> bool:
    """Return True if *text*, once stripped, looks like ``a. <something>``.

    The line must start with a single letter ``a``-``d`` (either case),
    followed by a period and one whitespace character.
    """
    candidate = text.strip()
    return bool(re.match(r'^[a-dA-D]\.\s', candidate))
[docs]
def parse_questions(questions_doc_path: Path, answers: dict):
    """
    Parse the questions .docx into a list of dicts matching the YAML schema::

        - id: Q1
          points: 1
          type: mcq
          stem: [ {type: text|code, text: "...", ...}, ... ]
          choices: [ {key: 'a', text: '...'}, ... ]
          correct: 'b'

    Parameters
    ----------
    questions_doc_path : Path
        Path to the questions .docx file.
    answers : dict
        Mapping of question number (int) to answer letter, as produced by
        ``parse_answer_key``.  Questions without an entry get
        ``correct: None``.

    Returns
    -------
    list of dict
        One dict per question, in document order.
    """
    # Imported lazily so python-docx is only needed on the Word path.
    from docx import Document

    doc = Document(str(questions_doc_path))
    paras = [paragraph_text(p) for p in doc.paragraphs]
    n = len(paras)
    i = 0
    questions = []

    while i < n:
        line = paras[i].strip()
        # Titles and any other paragraph that does not open a question are
        # skipped here, both at the top of the document and between questions.
        if not is_question_start(line):
            i += 1
            continue

        # Parse "N) ..." line
        m = re.match(r'^(\d+)\)\s*(.*)$', line)
        if not m:
            i += 1
            continue

        qnum = int(m.group(1))
        first_rest = m.group(2).strip()
        stem_blocks = []
        choices = []

        # If the first line has text after "N)", treat it as stem text
        if first_rest:
            stem_blocks.append({
                "type": "text",
                "text": first_rest,
            })
        i += 1

        # Collect stem paragraphs until we hit choices or the next question
        while i < n:
            raw = paras[i]
            text = raw.strip()
            if not text:
                i += 1
                continue
            if is_question_start(text) or is_choice_line(text):
                break
            if "\n" in raw:
                # A paragraph with literal newlines is treated as a code
                # block; the unstripped text is kept for it.
                stem_blocks.append({
                    "type": "code",
                    "language": "python",  # assumption for ENGR 131
                    "style": "mypython",   # matches your LaTeX listings style
                    "text": raw,
                })
            else:
                stem_blocks.append({
                    "type": "text",
                    "text": text,
                })
            i += 1

        # Collect choices if present (a choice may span multiple paragraphs)
        while i < n:
            text = paras[i].strip()
            if not text:
                i += 1
                continue
            if is_question_start(text):
                break
            m = re.match(r'^([a-dA-D])\.\s*(.*)$', text)
            if m:
                # Start a new choice
                choices.append({
                    "key": m.group(1).lower(),
                    "text": m.group(2),
                })
            elif choices:
                # Continuation line for the current choice
                choices[-1]["text"] += " " + text
            else:
                # Non-choice text before any choice was seen: stop.  The
                # outer loop then skips it as a non-question paragraph.
                break
            i += 1

        questions.append({
            "id": f"Q{qnum}",
            "points": 1,
            "type": "mcq",
            "stem": stem_blocks,
            "choices": choices,
            "correct": answers.get(qnum),
        })
        # The outer loop resumes at paras[i], which is either the next
        # question start or a paragraph that will be skipped.

    return questions
# ---------- QTI v2.1 XML parsing ----------
_QTI_NS = "http://www.imsglobal.org/xsd/imsqti_v2p1"
def _qti(tag):
"""Return a namespace-qualified tag name for QTI v2.1."""
return f"{{{_QTI_NS}}}{tag}"
def _collapse_blank_lines(text):
"""Collapse runs of multiple blank lines into a single blank line."""
return re.sub(r'\n{3,}', '\n\n', text)
# Topic name -> list of regular-expression patterns whose matches count as
# evidence for that topic.  ``_detect_topic`` sums the number of matches of
# every pattern (case-insensitively) over a question's stem and choice text
# and picks the topic with the highest total, so a phrase matched by several
# patterns of the same topic contributes once per pattern.
_TOPIC_INDICATORS = {
    "strings": [
        # stem/choice keywords
        r'\bstring\b', r'\bsubstring\b', r'\bcharacter\b', r'\bconcat',
        r'\bslice\b', r'\bslicing\b', r'\bf-string\b', r'\bformat',
        r'\bformatting\b', r'\bpresentation\s+type',
        # string methods
        r'\.split\b', r'\.join\b', r'\.strip\b', r'\.rstrip\b', r'\.lstrip\b',
        r'\.upper\b', r'\.lower\b', r'\.capitalize\b', r'\.title\b',
        r'\.find\b', r'\.rfind\b', r'\.index\b', r'\.count\b',
        r'\.replace\b', r'\.startswith\b', r'\.endswith\b',
        r'\.isdigit\b', r'\.isalpha\b', r'\.isalnum\b',
        r'\.isupper\b', r'\.islower\b', r'\.isspace\b',
        r'\.format\b', r'\.encode\b', r'\.decode\b',
        # string operations
        r'\[\s*-?\d*\s*:\s*-?\d*\s*(:\s*-?\d*\s*)?\]', # slicing [0:6], [-5:], [0:-1:2]
        # format specifiers (alignment/fill)
        r'\{[^}]*:[^}]*[<>^][^}]*\}', # {x:>4}, {x:*<4}
        # f-string / format() calls
        r'\bf"[^"]*"', r'\bf\'[^\']*\'',
    ],
    "functions": [
        r'\bfunction', # matches "function" and "functions"
        r'\bparameter\b', r'\bargument\b',
        r'\bdef\s+\w+', r'\breturn\b',
        r'\bscope\b', r'\blocal\b', r'\bglobal\b',
        r'\blambda\b', r'\brecursion\b', r'\brecursive\b',
        r'\bdocstring',
    ],
    "files": [
        r'\bfile\b', r'\bfilename\b',
        r'\bopen\s*\(', r'\.read\b', r'\.readline\b', r'\.readlines\b',
        r'\.write\b', r'\.writelines\b', r'\.close\b',
        r'\bwith\s+open\b',
        r"['\"][rwa]b?['\"]", # file modes 'r', 'w', 'a', 'rb'
        r'\bimport\s+csv\b', r'\bimport\s+json\b',
        r'\bcsv\.\w+', r'\bjson\.\w+',
        r'\bflush\b', r'\bbuffer\b',
        r'\bbytes\s*\(', r'\bbytes\s+literal', # bytes() calls
    ],
    "lists": [
        r'\blist\b', r'\blist\s*\(',
        r'\.append\b', r'\.extend\b', r'\.insert\b',
        r'\.pop\b', r'\.remove\b', r'\.sort\b', r'\.reverse\b',
        r'\bsorted\b', r'\blen\b',
        r'\bfor\b.*\bin\b', # iteration
        r'\[.*\bfor\b.*\bin\b', # list comprehension
        r'\ball\s*\(', r'\bany\s*\(', # list aggregate builtins
        r'\bmin\s*\(', r'\bmax\s*\(',
        r'\w_list\b', # variable names like new_list, my_list
    ],
    "dictionaries": [
        r'\bdict\b', r'\bdictionar',
        r'\.keys\b', r'\.values\b', r'\.items\b',
        r'\.get\b', r'\.update\b',
        r'\{\s*["\'\w]+\s*:\s*[^<>=*]', # dict literal {key: val} (exclude format specs)
    ],
    "modules": [
        r'\bmodule\b', r'\bpackage\b', r'\blibrary\b',
        r'\bimport\s+\w+', r'\bfrom\s+\w+\s+import\b',
        r'\bsys\b', r'\bos\b', r'\bmath\b',
        r'\bsys\.argv\b', r'\b__name__\b',
        r'\bcommand.line\s+argument',
        r'\bpip\b',
        r'\brandom\.\w+', # random.randrange, random.randint
        r'\bnp\.\w+', r'\bnumpy\b', r'\bpandas\b', # numpy/pandas
        r'\bstruct\.\w+', # struct.pack, struct.unpack
    ],
    "classes": [
        r'\bclass\s+\w+', r'\bself\.',
        r'\b__init__\b', r'\binherit', r'\bsubclass\b',
        r'\bobject\b', r'\binstance\b', r'\bconstructor\b',
        r'\battribute\b', r'\bmethod\b',
        r'\bsuper\s*\(',
        r'\b__\w+__\s*\(', # dunder method calls __mul__(, __mod__(
        r'\bisinstance\s*\(',
        r'\boverload',
    ],
}
def _detect_topic(stem_blocks, choices):
    """Pick the topic whose indicator patterns match the question most often.

    The ``text`` of every stem block and every choice is joined with
    newlines, and for each topic in ``_TOPIC_INDICATORS`` the matches of all
    its patterns are counted case-insensitively.  The topic with the highest
    count is returned (the first one listed wins a tie); if nothing matches
    at all, ``"other"`` is returned.
    """
    texts = [entry.get("text", "") for entry in stem_blocks]
    texts.extend(entry.get("text", "") for entry in choices)
    haystack = "\n".join(texts)

    hit_counts = {
        topic: sum(
            len(re.findall(regex, haystack, re.IGNORECASE))
            for regex in regexes
        )
        for topic, regexes in _TOPIC_INDICATORS.items()
    }
    winner = max(hit_counts, key=hit_counts.get)
    if hit_counts[winner] > 0:
        return winner
    return "other"
def _looks_like_code(text):
"""Heuristic: return True if *text* looks like a code expression.
Checks for common Python syntax indicators such as format specifiers,
method calls, brackets, assignment operators, etc.
"""
indicators = [
r'\{.*[:!].*\}', # format specifiers {x:>4}, {x!r}
r'\w+\.\w+\(', # method calls foo.bar(
r'\w+\(', # function calls upper(), len(
r'\w+\[', # subscript arr[
r'\w+\s*[=!<>]=', # comparison/assign x == , x !=
r'\breturn\b|\bprint\b|\bimport\b', # unambiguous keywords
r'\bTrue\b|\bFalse\b|\bNone\b', # builtins
# compound-statement keywords only when they look like Python
# statements (line ends with a colon)
r'(?m)^\s*(?:class|def|if|elif|else|for|while|with|try|except|finally)\b.*:\s*$',
]
return any(re.search(p, text) for p in indicators)
def _wrap_inline_code(text):
"""Wrap string literals and Python identifiers in ``backtick`` markers.
Detects double-quoted string literals and underscore-containing words
(e.g. ``new_string``, ``my_list``) that are almost certainly Python
identifiers. Skips regions already inside ``...`` backtick markers.
"""
# Split on ``...`` regions, only transform the outside parts
parts = re.split(r'(``.*?``)', text)
for i, part in enumerate(parts):
if not part.startswith('``'):
# Wrap double-quoted string literals
part = re.sub(r'"([^"]+)"', r'``"\1"``', part)
# Wrap underscore identifiers (e.g. new_string, my_list)
part = re.sub(r'\b(\w+_\w+)\b', r'``\1``', part)
parts[i] = part
return "".join(parts)
def _table_to_text(table_elem):
    """Render a QTI ``<table>`` element as right-aligned plain-text rows.

    Each ``<tr>`` becomes one output line made of the stripped text of its
    ``<td>`` cells.  Short rows are padded with empty cells, every column is
    right-justified to its widest cell, and cells are separated by a single
    space.  The result resembles ``pandas`` DataFrame output and is meant to
    be used as a ``code`` stem block.

    Returns an empty string when the table contains no ``<tr>`` elements.
    """
    row_tag = f"{{{_QTI_NS}}}tr"
    cell_tag = f"{{{_QTI_NS}}}td"

    # A cell might contain nested <p> or bare text; _element_text handles both.
    grid = [
        [_element_text(cell).strip() for cell in row.iter(cell_tag)]
        for row in table_elem.iter(row_tag)
    ]
    if not grid:
        return ""

    # Pad all rows to the same number of columns.
    n_cols = max(map(len, grid))
    grid = [cells + [""] * (n_cols - len(cells)) for cells in grid]

    widths = [max(len(cells[col]) for cells in grid) for col in range(n_cols)]
    return "\n".join(
        " ".join(cell.rjust(width) for cell, width in zip(cells, widths))
        for cells in grid
    )
def _element_text(elem, *, inline_code=False):
"""Recursively extract text from an XML element, converting <br/> to newlines.
Parameters
----------
inline_code : bool
If True, wrap ``<code>`` content with double-backtick markers
(``...``) so the downstream LaTeX renderer can typeset them
with ``\\inl{}``.
"""
_BLOCK_TAGS = {"p", "div", "pre"}
parts = []
if elem.text:
parts.append(elem.text)
for child in elem:
local = child.tag.split("}")[-1] if "}" in child.tag else child.tag
if local == "br":
parts.append("\n")
elif local == "code" and inline_code:
code_text = _element_text(child)
parts.append(f"``{code_text.strip()}``")
else:
# Insert a newline before block-level elements when content precedes them
if local in _BLOCK_TAGS and parts and not parts[-1].endswith("\n"):
parts.append("\n")
parts.append(_element_text(child, inline_code=inline_code))
if child.tail:
parts.append(child.tail)
return "".join(parts)
[docs]
def parse_qti_item(xml_path):
    """Parse a single QTI v2.1 assessmentItem XML file into a question dict.

    Returns None if the root element is not an ``assessmentItem`` (e.g. a
    manifest) or if the item has no ``itemBody`` (a warning is printed in
    that case).  XML parse errors are not caught here; whatever ``ET.parse``
    raises propagates to the caller.

    The returned dict has the keys ``id``, ``points`` (always 1), ``type``
    (always ``"mcq"``), ``topic``, ``stem``, ``choices`` and ``correct``:

    * ``stem`` is a list of blocks of type ``image`` (``src``, ``alt``,
      ``width``), ``code`` (``language``, ``style``, ``text``) or ``text``.
    * ``choices`` is a list of ``{"key", "text"}`` dicts; a choice also gets
      ``"type": "code"`` when it contains a ``<code>`` element, when the stem
      wording implies code/output answers, or when its text looks like code.
    * ``correct`` is the lower-cased text of the first correct-response
      value, or None when there is none.
    """
    tree = ET.parse(xml_path)
    root = tree.getroot()
    # Skip files that aren't assessmentItem elements
    root_local = root.tag.split("}")[-1] if "}" in root.tag else root.tag
    if root_local != "assessmentItem":
        return None
    # Fall back to the file name (without extension) when there is no identifier
    qid = root.get("identifier", Path(xml_path).stem)
    # --- correct answer ---
    correct = None
    val = root.find(f"{_qti('responseDeclaration')}/{_qti('correctResponse')}/{_qti('value')}")
    if val is not None and val.text:
        correct = val.text.strip().lower()
    # --- item body ---
    item_body = root.find(_qti("itemBody"))
    if item_body is None:
        print(f" WARNING: {Path(xml_path).name} has no itemBody, skipping")
        return None
    stem_blocks = []
    choices = []
    # Stem: <p>, <img> and <table> children of the top-level <div> elements
    # (not inside choiceInteraction); any other child element is ignored
    for div in item_body.findall(_qti("div")):
        for child in div:
            child_local = child.tag.split("}")[-1] if "}" in child.tag else child.tag
            if child_local == "img":
                # Image element — record as image stem block
                src = child.get("src", "")
                alt = child.get("alt", "")
                width_attr = child.get("width", "")
                stem_blocks.append({
                    "type": "image",
                    "src": src,
                    "alt": alt,
                    "width": width_attr,
                })
                continue
            if child_local == "table":
                # Tables are rendered as aligned plain text in a code block;
                # a table that renders to an empty string is dropped
                table_text = _table_to_text(child)
                if table_text:
                    stem_blocks.append({
                        "type": "code",
                        "language": "text",
                        "style": "mypython",
                        "text": table_text,
                    })
                continue
            if child_local != "p":
                continue
            p = child
            # Only a <code> that is a direct child of the <p> is considered
            code = p.find(_qti("code"))
            if code is not None:
                # Distinguish block code (<p> is *only* a code listing)
                # from inline code (<code> mixed with surrounding text).
                has_surrounding_text = bool(
                    (p.text and p.text.strip())
                    or (code.tail and code.tail.strip())
                )
                if has_surrounding_text:
                    # Inline code — render as text with ``backtick`` markers
                    text = _wrap_inline_code(_element_text(p, inline_code=True).strip())
                    if text:
                        stem_blocks.append({"type": "text", "text": text})
                else:
                    # Block code listing
                    code_text = _collapse_blank_lines(_element_text(code).strip())
                    stem_blocks.append({
                        "type": "code",
                        "language": "python",
                        "style": "mypython",
                        "text": code_text,
                    })
            else:
                text = _wrap_inline_code(_element_text(p).strip())
                if text:
                    stem_blocks.append({"type": "text", "text": text})
    # Merge consecutive text blocks where the second is a sentence
    # continuation (ZyBooks splits <p> at inline-element boundaries).
    # A continuation is a block starting with a lowercase letter (joined
    # with a space) or with punctuation (joined directly).
    merged = []
    for block in stem_blocks:
        if (merged
                and merged[-1].get("type") == "text"
                and block.get("type") == "text"):
            text = block["text"]
            if text and (text[0].islower() or text[0] in ',.?!;:'):
                if text[0] in ',.?!;:':
                    merged[-1]["text"] += text
                else:
                    merged[-1]["text"] += " " + text
                continue
        merged.append(block)
    stem_blocks = merged
    # Determine if the stem implies choices are code/output; the patterns
    # are matched against the lower-cased text blocks of the stem only
    stem_text_joined = " ".join(
        b["text"] for b in stem_blocks if b.get("type") == "text"
    ).lower()
    stem_implies_code = bool(re.search(
        r"what is (the )?(output|result|value)"
        r"|what does .* (print|output|return)"
        r"|what will .* (print|output|display)"
        r"|complete the (following )?code"
        r"|which .* (line|statement|expression|code|method)"
        r"|presentation type"
        r"|as output",
        stem_text_joined,
    ))
    # Choices: only the first choiceInteraction directly under itemBody is read
    ci = item_body.find(_qti("choiceInteraction"))
    if ci is not None:
        for sc in ci.findall(_qti("simpleChoice")):
            key = sc.get("identifier", "").lower()
            # Check if this individual choice contains <code>
            has_code_elem = sc.find(f".//{_qti('code')}") is not None
            text = _element_text(sc).strip()
            if has_code_elem:
                text = _collapse_blank_lines(text)
            choice = {"key": key, "text": text}
            if has_code_elem or stem_implies_code or _looks_like_code(text):
                choice["type"] = "code"
            choices.append(choice)
    topic = _detect_topic(stem_blocks, choices)
    return {
        "id": qid,
        "points": 1,
        "type": "mcq",
        "topic": topic,
        "stem": stem_blocks,
        "choices": choices,
        "correct": correct,
    }
def _numeric_sort_key(path):
"""Extract the trailing integer from a filename for numeric sorting.
E.g. ``qti_item_VE_IP_42.xml`` -> 42. Files without a number sort last.
"""
m = re.search(r'(\d+)\.xml$', path.name)
return int(m.group(1)) if m else float('inf')
[docs]
def parse_qti_items_dir(items_dir):
    """Parse every ``*.xml`` file in *items_dir* into a list of questions.

    Files are processed in the numeric order given by ``_numeric_sort_key``.
    Files for which ``parse_qti_item`` returns None are left out.  The
    remaining questions are renumbered ``Q1``, ``Q2``, ... with zero padding
    wide enough for the total count, and a one-line summary is printed.

    Raises FileNotFoundError when the directory contains no ``.xml`` files.
    """
    xml_files = sorted(items_dir.glob("*.xml"), key=_numeric_sort_key)
    if not xml_files:
        raise FileNotFoundError(f"No .xml files found in {items_dir}")

    parsed = (parse_qti_item(xml_file) for xml_file in xml_files)
    questions = [item for item in parsed if item is not None]

    # Assign zero-padded sequential IDs
    width = len(str(len(questions)))
    for position, item in enumerate(questions, start=1):
        item["id"] = f"Q{position:0{width}d}"

    print(f"Parsed {len(questions)} QTI items from {items_dir} ({len(xml_files) - len(questions)} skipped)")
    return questions
# ---------- Main entry points ----------
[docs]
def _is_within(path, root):
    """Return True if the resolved *path* is *root* itself or lies below it."""
    try:
        path.relative_to(root)
    except ValueError:
        return False
    return True


def convert(zip_path, output_yaml_path):
    """Convert a ZyBooks-exported zip into a YAML question bank.

    Auto-detects the format: if the zip contains an ``items/`` directory with
    XML files, the QTI parser is used.  Otherwise, falls back to the Word
    document parser.

    Images referenced by ``image`` stem blocks are copied from the zip's
    ``items/`` directory to ``images/<src>`` next to the YAML file.
    Intermediate directories are created as needed.  A reference whose source
    would lie outside the extracted zip, or whose destination would lie
    outside the output directory, is skipped with a warning.

    Parameters
    ----------
    zip_path : str or Path
        Path to the exported zip file.
    output_yaml_path : str or Path
        Path to the output YAML file.  Its parent directory is created if it
        does not exist.

    Raises
    ------
    FileNotFoundError
        If *zip_path* does not exist or is not a regular file.
    """
    zip_path = Path(zip_path)
    output_yaml_path = Path(output_yaml_path)
    if not zip_path.is_file():
        raise FileNotFoundError(f"{zip_path} does not exist or is not a file.")

    output_dir = output_yaml_path.parent
    images_dir = output_dir / "images"
    image_count = 0

    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir_path = Path(tmpdir)
        with zipfile.ZipFile(zip_path, 'r') as zf:
            zf.extractall(tmpdir_path)

        items_dir = tmpdir_path / "items"
        if items_dir.is_dir() and any(items_dir.glob("*.xml")):
            questions = parse_qti_items_dir(items_dir)
        else:
            # NOTE(review): find_docs_in_extracted_dir is not defined in the
            # part of this module that is visible here; confirm it exists,
            # otherwise this fallback raises NameError.
            questions_doc, answer_key_doc = find_docs_in_extracted_dir(tmpdir_path)
            print(f"Using questions doc: {questions_doc.name}")
            print(f"Using answer key doc: {answer_key_doc.name}")
            answers = parse_answer_key(answer_key_doc)
            questions = parse_questions(questions_doc, answers)

        # Copy referenced images alongside the YAML.  This must happen while
        # the temporary extraction directory still exists.
        extract_root = tmpdir_path.resolve()
        output_root = output_dir.resolve()
        for q in questions:
            for block in q.get("stem", []):
                if block.get("type") != "image":
                    continue
                src = block["src"]
                src_path = (items_dir / src).resolve()
                dest_path = (images_dir / src).resolve()
                # ``src`` comes from the zip's XML: refuse anything that
                # would read outside the extraction or write outside the
                # output directory (e.g. "../../x" or an absolute path).
                if not (_is_within(src_path, extract_root)
                        and _is_within(dest_path, output_root)):
                    print(f"  WARNING: image {src} points outside the zip contents, skipping")
                    continue
                if not (items_dir.is_dir() and src_path.is_file()):
                    print(f"  WARNING: image {src} referenced but not found in zip")
                    continue
                # ``src`` may contain sub-directories, so create the whole
                # destination path rather than only ``images/``.
                dest_path.parent.mkdir(parents=True, exist_ok=True)
                shutil.copy2(src_path, dest_path)
                image_count += 1

    data = {"questions": questions}
    output_dir.mkdir(parents=True, exist_ok=True)
    with output_yaml_path.open("w", encoding="utf-8") as f:
        yaml.safe_dump(
            data,
            f,
            sort_keys=False,
            allow_unicode=True,
            width=80,
        )
    print(f"Wrote {len(questions)} questions to {output_yaml_path}")
    if image_count:
        print(f"Copied {image_count} image(s) to {images_dir}")
[docs]
def convert_subcommand(args):
    """Run :func:`convert` from a parsed CLI namespace.

    Reads ``args.input_zip`` and ``args.output_yaml``, performs the
    conversion and returns 0 as the exit status.
    """
    source_zip, target_yaml = args.input_zip, args.output_yaml
    convert(source_zip, target_yaml)
    return 0
[docs]
def main():
    """Script entry point: ``zybooks_zip_to_yaml.py export.zip output.yaml``.

    With exactly two command-line arguments the conversion is run; otherwise
    a usage line is printed and the process exits with status 1.
    """
    argv = sys.argv
    if len(argv) == 3:
        convert(argv[1], argv[2])
        return
    print("Usage: python zybooks_zip_to_yaml.py export.zip output.yaml")
    sys.exit(1)


if __name__ == "__main__":
    main()