- Rename skill/ to skills/ for consistency with naming conventions - Rename agent/ to agents/ and command/ to commands/ - Update AGENTS.md with all directory references - Update scripts/test-skill.sh paths - Update prompts/athena.txt documentation This aligns with best practices of using plural directory names and updates all documentation to reflect the new structure.
153 lines
6.0 KiB
Python
153 lines
6.0 KiB
Python
import json
|
|
import sys
|
|
|
|
from pypdf import PdfReader
|
|
|
|
|
|
# Extracts data for the fillable form fields in a PDF and outputs JSON that
# the Coding Agent uses to fill the fields. See forms.md.
|
|
|
|
|
|
# This matches the field-name format used by pypdf's `PdfReader.get_fields` and `PdfWriter.update_page_form_field_values` methods.
|
|
def get_full_annotation_field_id(annotation):
    """Build the fully qualified field id for an annotation.

    Walks from ``annotation`` up through its ``/Parent`` links, collecting the
    partial name (``/T``) of every node that has one, and joins the names
    root-first with ``.``.

    Args:
        annotation: A dict-like annotation/field object supporting ``.get``.
            A falsy value (``None``, empty dict) yields ``None``.

    Returns:
        The dotted field id, or ``None`` when no node in the chain carries a
        ``/T`` entry.
    """
    components = []
    # Hold on to the visited nodes themselves and compare by identity: a
    # malformed file whose /Parent links form a cycle would otherwise make
    # this loop spin forever.
    visited = []
    while annotation and not any(annotation is seen for seen in visited):
        visited.append(annotation)
        field_name = annotation.get("/T")
        if field_name:
            components.append(field_name)
        annotation = annotation.get("/Parent")
    # Names were gathered leaf-first; the id is written root-first.
    return ".".join(reversed(components)) if components else None
|
|
|
|
|
|
def make_field_dict(field, field_id):
    """Describe a single (non-container) form field as a plain dict.

    Args:
        field: Dict-like field object; ``/FT`` selects the field type and
            ``/_States_`` (when present) lists its possible states/options.
        field_id: Fully qualified field name, stored under ``"field_id"``.

    Returns:
        A dict with ``"field_id"`` and ``"type"``, plus:

        * checkbox (``/Btn``) with exactly two states: ``"checked_value"``
          and ``"unchecked_value"``;
        * choice (``/Ch``): ``"choice_options"``, a list of
          ``{"value": ..., "text": ...}`` dicts.

        Unrecognized field types get ``"type": "unknown (<ft>)"``.
    """
    field_dict = {"field_id": field_id}
    ft = field.get("/FT")
    if ft == "/Tx":
        field_dict["type"] = "text"
    elif ft == "/Btn":
        field_dict["type"] = "checkbox"  # radio groups handled separately
        states = field.get("/_States_", [])
        if len(states) == 2:
            # "/Off" seems to always be the unchecked value, as suggested by
            # https://opensource.adobe.com/dc-acrobat-sdk-docs/standards/pdfstandards/pdf/PDF32000_2008.pdf#page=448
            # It can be either first or second in the "/_States_" list.
            if "/Off" in states:
                field_dict["checked_value"] = states[0] if states[0] != "/Off" else states[1]
                field_dict["unchecked_value"] = "/Off"
            else:
                # Best-effort guess; warn so the caller verifies visually.
                print(f"Unexpected state values for checkbox `{field_id}`. Its checked and unchecked values may not be correct; if you're trying to check it, visually verify the results.")
                field_dict["checked_value"] = states[0]
                field_dict["unchecked_value"] = states[1]
    elif ft == "/Ch":
        field_dict["type"] = "choice"
        options = []
        for state in field.get("/_States_", []):
            if isinstance(state, str):
                # A choice option may be a bare string instead of an
                # [export value, display text] pair; indexing it would pick
                # out single characters, so use it for both roles.
                export_value = display_text = state
            else:
                export_value, display_text = state[0], state[1]
            options.append({"value": export_value, "text": display_text})
        field_dict["choice_options"] = options
    else:
        field_dict["type"] = f"unknown ({ft})"
    return field_dict
|
|
|
|
|
|
# Returns a list of fillable PDF fields:
|
|
# [
|
|
# {
|
|
# "field_id": "name",
|
|
# "page": 1,
|
|
# "type": ("text", "checkbox", "radio_group", or "choice")
|
|
# // Per-type additional fields described in forms.md
|
|
# },
|
|
# ]
|
|
def get_field_info(reader: PdfReader):
    """Return location and type info for the fillable fields of a PDF.

    Field definitions come from ``reader.get_fields()``; their page number and
    bounding rect come from the ``/Annots`` entries of each page. Radio groups
    are assembled from the per-option annotations that share the group's
    field name.

    Args:
        reader: The opened PDF.

    Returns:
        A list of field dicts (see ``make_field_dict``), each extended with
        ``"page"`` (1-based) and ``"rect"``; radio groups instead carry
        ``"radio_options"``, a list of ``{"value", "rect"}`` dicts. The list
        is sorted by page, then top-to-bottom, then left-to-right. Fields
        with no matching annotation are reported on stdout and omitted.
    """
    # `get_fields()` returns None rather than an empty dict when the document
    # has no form fields; normalize so such a PDF yields an empty list
    # instead of raising AttributeError on `.items()`.
    fields = reader.get_fields() or {}

    field_info_by_id = {}
    possible_radio_names = set()

    for field_id, field in fields.items():
        # Skip if this is a container field with children, except that it might be
        # a parent group for radio button options.
        if field.get("/Kids"):
            if field.get("/FT") == "/Btn":
                possible_radio_names.add(field_id)
            continue
        field_info_by_id[field_id] = make_field_dict(field, field_id)

    # Bounding rects are stored in annotations in page objects.

    # Radio button options have a separate annotation for each choice;
    # all choices have the same field name.
    # See https://westhealth.github.io/exploring-fillable-forms-with-pdfrw.html
    radio_fields_by_id = {}

    for page_index, page in enumerate(reader.pages):
        annotations = page.get("/Annots", [])
        for ann in annotations:
            field_id = get_full_annotation_field_id(ann)
            if field_id in field_info_by_id:
                field_info_by_id[field_id]["page"] = page_index + 1
                field_info_by_id[field_id]["rect"] = ann.get("/Rect")
            elif field_id in possible_radio_names:
                try:
                    # ann['/AP']['/N'] should have two items. One of them is '/Off',
                    # the other is the active value.
                    on_values = [v for v in ann["/AP"]["/N"] if v != "/Off"]
                except KeyError:
                    # No appearance dictionary: cannot tell this option's value.
                    continue
                if len(on_values) == 1:
                    rect = ann.get("/Rect")
                    if field_id not in radio_fields_by_id:
                        # The group's page is that of its first option seen.
                        radio_fields_by_id[field_id] = {
                            "field_id": field_id,
                            "type": "radio_group",
                            "page": page_index + 1,
                            "radio_options": [],
                        }
                    # Note: at least on macOS 15.7, Preview.app doesn't show selected
                    # radio buttons correctly. (It does if you remove the leading slash
                    # from the value, but that causes them not to appear correctly in
                    # Chrome/Firefox/Acrobat/etc).
                    radio_fields_by_id[field_id]["radio_options"].append({
                        "value": on_values[0],
                        "rect": rect,
                    })

    # Some PDFs have form field definitions without corresponding annotations,
    # so we can't tell where they are. Ignore these fields for now.
    fields_with_location = []
    for field_info in field_info_by_id.values():
        if "page" in field_info:
            fields_with_location.append(field_info)
        else:
            print(f"Unable to determine location for field id: {field_info.get('field_id')}, ignoring")

    # Sort by page number, then Y position (flipped in PDF coordinate system), then X.
    def sort_key(f):
        if "radio_options" in f:
            # A radio group is positioned by its first option.
            rect = f["radio_options"][0]["rect"] or [0, 0, 0, 0]
        else:
            rect = f.get("rect") or [0, 0, 0, 0]
        adjusted_position = [-rect[1], rect[0]]
        return [f.get("page"), adjusted_position]

    sorted_fields = fields_with_location + list(radio_fields_by_id.values())
    sorted_fields.sort(key=sort_key)

    return sorted_fields
|
|
|
|
|
|
def write_field_info(pdf_path: str, json_output_path: str):
    """Extract form-field info from a PDF and save it as indented JSON.

    Args:
        pdf_path: Path of the PDF to read.
        json_output_path: Path of the JSON file to create or overwrite.

    Prints a one-line summary with the number of fields written.
    """
    pdf_reader = PdfReader(pdf_path)
    extracted_fields = get_field_info(pdf_reader)
    field_count = len(extracted_fields)
    with open(json_output_path, "w") as out_file:
        json.dump(extracted_fields, out_file, indent=2)
    print(f"Wrote {field_count} fields to {json_output_path}")
|
|
|
|
|
|
if __name__ == "__main__":
    # Command-line entry point: expects exactly two positional arguments,
    # the input PDF path and the output JSON path.
    cli_args = sys.argv[1:]
    if len(cli_args) != 2:
        print("Usage: extract_form_field_info.py [input pdf] [output json]")
        sys.exit(1)
    input_pdf, output_json = cli_args
    write_field_info(input_pdf, output_json)
|