feat(basecamp-project): add markdown to trix converter

- Add markdown_to_trix.py with Markdown→HTML conversion for Basecamp - Implement is_basecamp_safe() for unsupported feature detection - Add convert_table_to_lists() for table→list conversion - Include CLI with --file, --output, and --check modes - Add comprehensive self-tests for all functions - Add requirements.txt with markdown dependency
2026-04-27 09:52:21 +02:00
parent a88e6f9f06
commit 30602ca8c6
2 changed files with 483 additions and 0 deletions
@@ -0,0 +1,481 @@
 #!/usr/bin/env python3
 """
 Markdown to Basecamp Trix HTML Converter.
 Converts Markdown text to Basecamp-compatible HTML in Trix format.
 Handles Basecamp-specific limitations and provides safety checks.
 """
 import argparse
 import logging
 import re
 import sys
 from typing import Optional
 try:
    import markdown as markdown_lib
    MARKDOWN_AVAILABLE = True
 except ImportError:
    MARKDOWN_AVAILABLE = False
    markdown_lib = None  # type: ignore[assignment-assign]
 logging.basicConfig(
    level=logging.DEBUG,
    format="%(levelname)s: %(message)s"
 )
 logger = logging.getLogger(__name__)
 def markdown_to_trix(markdown_text: str) -> str:
    """
    Convert Markdown text to Basecamp-compatible HTML (Trix format).
    Args:
        markdown_text: The Markdown content to convert.
    Returns:
        HTML string compatible with Basecamp Trix editor.
    Raises:
        ImportError: If markdown library is not installed.
    """
    if not MARKDOWN_AVAILABLE:
        raise ImportError(
            "The 'markdown' library is required. Install with: pip install markdown"
        )
    # Strip potential HTML comments or existing HTML that might cause issues
    cleaned_text = _preprocess_markdown(markdown_text)
    # Configure markdown with safe extensions
    md = markdown_lib.Markdown(  # type: ignore[attr-defined]
        extensions=[
            "tables",
            "fenced_code",
            "codehilite",
            "nl2br",
            "sane_lists",
        ],
        extension_configs={
            "codehilite": {"css_class": "highlight"},
        },
        output_format="html",
    )
    # Convert markdown to HTML
    html = md.convert(cleaned_text)
    # Post-process for Basecamp compatibility
    html = _post_process_html(html)
    return html
 def _preprocess_markdown(text: str) -> str:
    """Preprocess markdown text before conversion."""
    lines = text.split("\n")
    processed_lines = []
    for line in lines:
        # Preserve checkboxes but mark them for post-processing
        # Basecamp supports checkboxes only in to-dos, not messages
        if re.match(r"^(\s*)-\s*\[[\sXx]\]\s+", line):
            # Mark checkbox lines for later handling
            processed_lines.append(line)
        else:
            processed_lines.append(line)
    return "\n".join(processed_lines)
 def _post_process_html(html: str) -> str:
    """
    Post-process HTML for Basecamp Trix compatibility.
    - Removes horizontal rules or converts them to styled divs
    - Cleans up unnecessary tags
    - Ensures proper inline formatting
    """
    # Remove empty paragraphs
    html = re.sub(r"<p>\s*</p>", "", html)
    # Convert horizontal rules to Basecamp-friendly separators
    # Basecamp may not render <hr> correctly, use styled div instead
    html = re.sub(
        r"<hr\s*/?>",
        '<div style="border-bottom: 1px solid #e0e0e0; margin: 16px 0;"></div>',
        html,
    )
    # Clean up multiple consecutive breaks
    html = re.sub(r"(<br\s*/?>){3,}", "<br><br>", html)
    # Ensure links open in new tab (Basecamp best practice)
    html = re.sub(
        r'<a href="([^"]+)"',
        r'<a href="\1" target="_blank" rel="noopener noreferrer"',
        html,
    )
    # Wrap code blocks properly for Basecamp
    html = re.sub(
        r'<pre><code class="([^"]*)">',
        r'<pre class="\1"><code>',
        html,
    )
    return html
 def is_basecamp_safe(markdown_text: str) -> list[str]:
    """
    Check Markdown for unsupported Basecamp features.
    Args:
        markdown_text: The Markdown content to check.
    Returns:
        List of warning messages for unsupported features found.
    """
    warnings: list[str] = []
    # Check for tables
    if re.search(r"^\|.*\|.*$", markdown_text, re.MULTILINE):
        logger.debug("Found table syntax in markdown")
        warnings.append(
            "Tables not supported: convert to structured lists"
        )
    # Check for checkboxes (only in document context, not todo lists)
    if re.search(r"^(\s*)-\s*\[[\sXx]\]\s+", markdown_text, re.MULTILINE):
        logger.debug("Found checkbox syntax in markdown")
        warnings.append(
            "Checkboxes not supported in messages: create as todos instead"
        )
    # Check for horizontal rules
    if re.search(r"^[-*_]{3,}\s*$", markdown_text, re.MULTILINE):
        logger.debug("Found horizontal rule syntax")
        warnings.append(
            "Horizontal rules may not render correctly"
        )
    # Check for complex nested structures that might not render
    if markdown_text.count("    ") > 10:
        logger.debug("Found deeply nested indentation")
        warnings.append(
            "Deep nesting may not render correctly: consider flattening structure"
        )
    return warnings
 def convert_table_to_lists(table_md: str) -> str:
    """
    Convert a Markdown table to nested lists in Basecamp-friendly format.
    Args:
        table_md: The Markdown table content (without outer pipes).
    Returns:
        Nested list representation suitable for Basecamp.
    Example:
        Input:
            | Name | Value |
            |------|-------|
            | Foo  | Bar   |
        Output:
            Name:
              - Name: Foo
              - Value: Bar
    """
    lines = [line.strip() for line in table_md.strip().split("\n") if line.strip()]
    if len(lines) < 2:
        return table_md
    # Parse header
    header_match = re.match(r"\|(.+)\|", lines[0])
    if not header_match:
        return table_md
    headers = [h.strip() for h in header_match.group(1).split("|")]
    # Skip separator line if present
    data_lines = lines[1:]
    if data_lines and re.match(r"^\|[-:\s|]+\|$", data_lines[0]):
        data_lines = data_lines[1:]
    result_parts: list[str] = []
    for line in data_lines:
        row_match = re.match(r"\|(.+)\|", line)
        if not row_match:
            continue
        values = [v.strip() for v in row_match.group(1).split("|")]
        # Create nested list for each row
        row_items: list[str] = []
        for header, value in zip(headers, values):
            row_items.append(f"  - {header}: {value}")
        if row_items:
            result_parts.append("\n".join(row_items))
    return "\n\n".join(result_parts)
 def convert_checkbox_to_text(line: str) -> str:
    """
    Convert a checkbox line to plain text.
    Args:
        line: A markdown line with checkbox syntax.
    Returns:
        Plain text representation with checkbox state indicated.
    """
    # Match: optional_indent - [ ] or - [x] or - [X]
    match = re.match(r"^(\s*-\s*)\[[\sXx]\]\s+(.*)", line)
    if match:
        indent, text = match.groups()
        checkbox_char = "☐" if " " in match.group(0) else "☑"
        return f"{indent}{checkbox_char} {text}"
    return line
 def _cli_check_mode(filename: str) -> int:
    """
    Run in check mode: only output warnings without conversion.
    Args:
        filename: Path to the markdown file.
    Returns:
        Exit code: 0 if no warnings, 1 if warnings found, 2 on file error.
    """
    try:
        with open(filename, "r", encoding="utf-8") as f:
            markdown_text = f.read()
    except FileNotFoundError:
        print(f"Error: File not found: {filename}", file=sys.stderr)
        return 2
    except PermissionError:
        print(f"Error: Permission denied: {filename}", file=sys.stderr)
        return 2
    except Exception as e:
        print(f"Error reading file: {e}", file=sys.stderr)
        return 2
    warnings = is_basecamp_safe(markdown_text)
    if warnings:
        print(f"Warnings for {filename}:")
        for warning in warnings:
            print(f"  ⚠️  {warning}")
        return 1
    else:
        print(f"✅ No unsupported features found in {filename}")
        return 0
 def _cli_convert_mode(filename: str, output: Optional[str]) -> int:
    """
    Run in convert mode: convert markdown to HTML.
    Args:
        filename: Path to the markdown file.
        output: Optional path for output file.
    Returns:
        Exit code: 0 on success, 1 on warnings, 2 on error.
    """
    try:
        with open(filename, "r", encoding="utf-8") as f:
            markdown_text = f.read()
    except FileNotFoundError:
        print(f"Error: File not found: {filename}", file=sys.stderr)
        return 2
    except PermissionError:
        print(f"Error: Permission denied: {filename}", file=sys.stderr)
        return 2
    except Exception as e:
        print(f"Error reading file: {e}", file=sys.stderr)
        return 2
    # Show warnings before conversion
    warnings = is_basecamp_safe(markdown_text)
    if warnings:
        print(f"Warnings for {filename}:")
        for warning in warnings:
            print(f"  ⚠️  {warning}")
        print()
    # Check for markdown library
    if not MARKDOWN_AVAILABLE:
        print(
            "Error: markdown library not installed.",
            file=sys.stderr
        )
        print(
            "Install with: pip install markdown",
            file=sys.stderr
        )
        return 2
    try:
        html_output = markdown_to_trix(markdown_text)
    except Exception as e:
        print(f"Error converting markdown: {e}", file=sys.stderr)
        return 2
    if output:
        try:
            with open(output, "w", encoding="utf-8") as f:
                f.write(html_output)
            print(f"✅ Converted {filename} → {output}")
        except Exception as e:
            print(f"Error writing output: {e}", file=sys.stderr)
            return 2
    else:
        # Write to stdout
        print(html_output)
    return 1 if warnings else 0
 def main() -> int:
    """
    CLI entry point.
    Returns:
        Exit code: 0=OK, 1=Warnungen, 2=Fehler.
    """
    parser = argparse.ArgumentParser(
        prog="markdown_to_trix.py",
        description="Convert Markdown to Basecamp Trix HTML format.",
    )
    parser.add_argument(
        "--file",
        "-f",
        metavar="INPUT.md",
        help="Input Markdown file to convert",
    )
    parser.add_argument(
        "--output",
        "-o",
        metavar="OUTPUT.html",
        help="Output HTML file (default: stdout)",
    )
    parser.add_argument(
        "--check",
        "-c",
        metavar="INPUT.md",
        help="Only check for unsupported features, don't convert",
    )
    parser.add_argument(
        "--verbose",
        "-v",
        action="store_true",
        help="Enable verbose (debug) logging",
    )
    args = parser.parse_args()
    if args.verbose:
        logging.getLogger().setLevel(logging.DEBUG)
    # Handle mutually exclusive modes
    if args.check:
        return _cli_check_mode(args.check)
    if args.file:
        return _cli_convert_mode(args.file, args.output)
    # No mode specified, show help
    parser.print_help()
    return 0
 if __name__ == "__main__":
    # Parse arguments first
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("--check", "-c", metavar="INPUT.md")
    parser.add_argument("--file", "-f", metavar="INPUT.md")
    parser.add_argument("--output", "-o", metavar="OUTPUT.html")
    args, _ = parser.parse_known_args()
    if args.check or args.file:
        # CLI mode with arguments
        sys.exit(main())
    # Run self-tests when executed without arguments
    print("Running self-test...")
    test_cases = [
        {
            "name": "Basic conversion",
            "input": "# Hello\n\n**Bold** and *italic* text.",
            "expected_warnings": 0,
        },
        {
            "name": "Table detection",
            "input": "# Test\n\n| A | B |\n|---|---|\n| 1 | 2 |",
            "expected_warnings": 1,
        },
        {
            "name": "Checkbox detection",
            "input": "# Tasks\n\n- [ ] Task 1\n- [x] Task 2",
            "expected_warnings": 1,
        },
        {
            "name": "Horizontal rule",
            "input": "# Test\n\n---\n\nContent",
            "expected_warnings": 1,
        },
        {
            "name": "Safe markdown",
            "input": "# Title\n\n- Item 1\n- Item 2\n\n**Bold** and [link](https://example.com)",
            "expected_warnings": 0,
        },
    ]
    all_passed = True
    for test in test_cases:
        warnings = is_basecamp_safe(test["input"])
        passed = len(warnings) == test["expected_warnings"]
        status = "✅ PASS" if passed else "❌ FAIL"
        print(f"  {status}: {test['name']} (warnings: {len(warnings)})")
        if not passed:
            all_passed = False
            print(f"       Expected: {test['expected_warnings']}, Got: {len(warnings)}")
            print(f"       Warnings: {warnings}")
    # Test table conversion
    print("\nTesting table conversion...")
    table_md = """
 | Name | Role | Email |
 |------|------|-------|
 | Alice | Dev | alice@example.com |
 | Bob | Design | bob@example.com |
 """
    converted = convert_table_to_lists(table_md)
    print(f"  Input:\n{table_md}")
    print(f"  Output:\n{converted}")
    if all_passed:
        print("\n✅ All self-tests passed!")
        sys.exit(0)
    else:
        print("\n❌ Some self-tests failed!")
        sys.exit(1)
@@ -0,0 +1,2 @@
 # Markdown to Trix Converter Dependencies
 markdown>=3.4.0
		`@@ -0,0 +1,2 @@`
							`# Markdown to Trix Converter Dependencies`
							`markdown>=3.4.0`