#!/usr/bin/env python3
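"""picopaper — a minimal static site generator.

Reads markdown posts from an items/ directory (filenames encode date, post
type, slug, and an optional feed), renders them through Jinja2 templates from
a theme, and writes a static site plus an optional RSS 2.0 feed to output/.
All site-wide settings (titles, theme, feed options, author info) come from a
local config.py; the `from config import ...` list below shows what it must
define.
"""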
import re
from datetime import datetime, timezone
from pathlib import Path

from jinja2 import Environment, FileSystemLoader
import markdown

from config import (BLOG_TITLE, BLOG_DESCRIPTION, THEME, EXCLUDE_FEEDS_FROM_MAIN,
                    NAVBAR_ITEMS, HIDE_LOGO, HIDE_TITLE, LOGO_PATH,
                    ENABLE_RSS_FEED, RSS_FEED_PATH,
                    BASE_URL, AUTHOR_NAME, AUTHOR_EMAIL, FEED_MAX_ITEMS)
class SSGGGenerator:
    def __init__(self, items_dir='items', output_dir='output', theme=None,
                 blog_title=None, blog_description=None):
        self.items_dir = Path(items_dir)
        self.output_dir = Path(output_dir)
        self.theme = theme or THEME
        self.theme_dir = Path('theme') / self.theme
        self.templates_dir = self.theme_dir / 'templates'
        self.assets_dir = self.theme_dir / 'assets'
        self.blog_title = blog_title or BLOG_TITLE
        self.blog_description = blog_description or BLOG_DESCRIPTION
        self.exclude_feeds = EXCLUDE_FEEDS_FROM_MAIN
        self.navbar_items = NAVBAR_ITEMS
        self.hide_logo = HIDE_LOGO
        self.hide_title = HIDE_TITLE
        self.logo_path = LOGO_PATH

        # Setup Jinja2
        self.env = Environment(loader=FileSystemLoader(self.templates_dir))

        # Add custom filter for random sampling
        def random_sample(items, count):
            import random
            items_list = list(items)
            return random.sample(items_list, min(count, len(items_list)))
        self.env.filters['random_sample'] = random_sample

        # Setup markdown with toc extension for header anchors
        self.md = markdown.Markdown(extensions=['extra', 'toc'])
    def parse_filename(self, filename):
        """Parse filename format: YYYY-MM-DD_type_name[_feed].md"""
        # Anchored with $ so trailing junk after ".md" can't slip through
        pattern = r'(\d{4}-\d{2}-\d{2})_(short|long|page)_(.+?)(?:_([a-z0-9-]+))?\.md$'
        match = re.match(pattern, filename)
        if not match:
            return None
        date_str, post_type, name, feed = match.groups()
        date = datetime.strptime(date_str, '%Y-%m-%d')
        return {
            'date': date,
            'date_str': date.strftime('%Y-%m-%d'),
            'type': post_type,
            'name': name,
            'feed': feed,
            'filename': filename
        }
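    # Example (illustrative): "2024-03-01_long_hello-world_devlog.md" parses to
    # date 2024-03-01, type "long", name "hello-world", feed "devlog"; the
    # trailing feed segment is optional.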
    def add_header_anchors(self, html_content):
        """Add anchor links to headers with IDs"""
        # Pattern to match headers with id attributes: <h2 id="some-id">Text</h2>
        def replace_header(match):
            tag = match.group(1)
            header_id = match.group(2)
            text = match.group(3)
            # Add anchor link with # symbol
            return f'<{tag} id="{header_id}">{text} <a href="#{header_id}" class="header-anchor">#</a></{tag}>'

        # Match h2-h6 tags with id attributes; headers containing inline HTML
        # (e.g. <code>) are left untouched because [^<]+ won't match them
        pattern = r'<(h[2-6]) id="([^"]+)">([^<]+)</\1>'
        return re.sub(pattern, replace_header, html_content)
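    # For example, '<h2 id="setup">Setup</h2>' becomes
    # '<h2 id="setup">Setup <a href="#setup" class="header-anchor">#</a></h2>'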
    def read_post(self, filepath):
        """Read markdown file and extract title and content"""
        with open(filepath, 'r', encoding='utf-8') as f:
            content = f.read()

        # Extract title (a leading "# " heading at the top of the file)
        title_match = re.match(r'^#\s+(.+)$', content, re.MULTILINE)
        title = title_match.group(1) if title_match else 'Untitled'

        # Remove title from content
        if title_match:
            content = content[title_match.end():].strip()

        # Reset the shared Markdown instance so per-document state (e.g. the
        # toc extension's generated header IDs) doesn't leak between posts
        self.md.reset()

        # Convert markdown to HTML
        html_content = self.md.convert(content)

        # Add anchor links to headers
        html_content = self.add_header_anchors(html_content)
        return title, html_content
    def collect_posts(self):
        """Collect and parse all posts from items directory"""
        posts = []
        if not self.items_dir.exists():
            print(f"Warning: {self.items_dir} does not exist")
            return posts

        for filepath in self.items_dir.glob('*.md'):
            parsed = self.parse_filename(filepath.name)
            if not parsed:
                print(f"Skipping {filepath.name}: doesn't match naming convention")
                continue
            title, content = self.read_post(filepath)
            post = {
                'date': parsed['date_str'],
                'type': parsed['type'],
                'name': parsed['name'],
                'title': title,
                'content': content,
                'slug': parsed['name'],
                'url': f"/{parsed['name']}/",
                'feed': parsed['feed'],
                'source': filepath.name
            }
            posts.append(post)

        # Sort by date, newest first (ISO date strings sort lexicographically)
        posts.sort(key=lambda x: x['date'], reverse=True)
        return posts
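    # Each post dict carries: date (YYYY-MM-DD string), type (short/long/page),
    # name/slug, title, rendered HTML content, url ("/<slug>/"), the optional
    # feed, and the source filename.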
    def generate_index(self, posts, feed_name=None, all_posts=None):
        """Generate index.html with all posts (or feed-specific index)"""
        template = self.env.get_template('index.tmpl')

        if feed_name:
            title = f"{feed_name} - {self.blog_title}"
            output_path = self.output_dir / 'feed' / feed_name / 'index.html'
        else:
            title = self.blog_title
            output_path = self.output_dir / 'index.html'

        html = template.render(
            title=title,
            blog_title=self.blog_title,
            blog_description=self.blog_description,
            navbar_items=self.navbar_items,
            posts=posts,
            all_posts=all_posts or posts,
            hide_logo=self.hide_logo,
            hide_title=self.hide_title,
            logo_path=self.logo_path,
            rss_feed_enabled=ENABLE_RSS_FEED,
            rss_feed_path=RSS_FEED_PATH
        )

        output_path.parent.mkdir(parents=True, exist_ok=True)
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(html)
        print(f"✓ Generated {output_path}")
    def generate_feeds_overview(self, feeds, all_posts=None):
        """Generate /feed/index.html with list of all non-excluded feeds"""
        template = self.env.get_template('feeds.tmpl')

        # Prepare feed data with counts, excluding feeds in EXCLUDE_FEEDS_FROM_MAIN
        feed_list = []
        for feed_name, posts in sorted(feeds.items()):
            if feed_name not in self.exclude_feeds:
                feed_list.append({
                    'name': feed_name,
                    'count': len(posts)
                })

        title = f"Feeds - {self.blog_title}"
        output_path = self.output_dir / 'feed' / 'index.html'
        html = template.render(
            title=title,
            blog_title=self.blog_title,
            blog_description=self.blog_description,
            navbar_items=self.navbar_items,
            feeds=feed_list,
            all_posts=all_posts or [],
            hide_logo=self.hide_logo,
            hide_title=self.hide_title,
            logo_path=self.logo_path,
            rss_feed_enabled=ENABLE_RSS_FEED,
            rss_feed_path=RSS_FEED_PATH
        )

        output_path.parent.mkdir(parents=True, exist_ok=True)
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(html)
        print(f"✓ Generated {output_path}")
    def generate_post_page(self, post, all_posts=None):
        """Generate an individual page for a post (long, short, or page)"""
        template = self.env.get_template('post.tmpl')
        html = template.render(
            title=f"{post['title']} - {self.blog_title}",
            blog_title=self.blog_title,
            blog_description=self.blog_description,
            navbar_items=self.navbar_items,
            post=post,
            all_posts=all_posts or [],
            hide_logo=self.hide_logo,
            hide_title=self.hide_title,
            logo_path=self.logo_path,
            rss_feed_enabled=ENABLE_RSS_FEED,
            rss_feed_path=RSS_FEED_PATH
        )

        # Create directory for the post slug
        post_dir = self.output_dir / post['slug']
        post_dir.mkdir(parents=True, exist_ok=True)

        # Generate index.html inside the slug directory
        output_path = post_dir / 'index.html'
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(html)
        print(f"✓ Generated {output_path}")
    def generate_rss_feed(self, posts):
        """Generate RSS 2.0 feed for main feed posts"""
        from xml.etree.ElementTree import Element, SubElement, tostring, register_namespace
        from xml.dom import minidom

        # Register atom namespace to avoid ns0 prefix
        register_namespace('atom', 'http://www.w3.org/2005/Atom')

        # Limit posts
        posts = posts[:FEED_MAX_ITEMS]

        # Build feed URL correctly - ensure no double slashes
        feed_path = RSS_FEED_PATH.lstrip('/')
        # Remove trailing slash from BASE_URL if present for clean URL construction
        base_url_clean = BASE_URL.rstrip('/')
        feed_url = f"{base_url_clean}/{feed_path}"

        # Create RSS element (namespace will be added automatically when we use atom:link)
        rss = Element('rss', version='2.0')
        channel = SubElement(rss, 'channel')

        # Channel metadata
        SubElement(channel, 'title').text = self.blog_title
        SubElement(channel, 'description').text = self.blog_description
        SubElement(channel, 'link').text = base_url_clean

        # Add atom:link with rel="self" (required by RSS best practices)
        atom_link = SubElement(channel, '{http://www.w3.org/2005/Atom}link')
        atom_link.set('href', feed_url)
        atom_link.set('rel', 'self')
        atom_link.set('type', 'application/rss+xml')

        # datetime.utcnow() is deprecated; use an explicit UTC-aware timestamp
        SubElement(channel, 'lastBuildDate').text = datetime.now(timezone.utc).strftime('%a, %d %b %Y %H:%M:%S +0000')

        # Add author information (managingEditor format: email (name))
        if AUTHOR_EMAIL and AUTHOR_NAME:
            SubElement(channel, 'managingEditor').text = f"{AUTHOR_EMAIL} ({AUTHOR_NAME})"
        elif AUTHOR_EMAIL:
            SubElement(channel, 'managingEditor').text = AUTHOR_EMAIL

        # Helper function to convert root-relative URLs to absolute
        def make_absolute_urls(html_content):
            html_content = re.sub(r'href="/', f'href="{base_url_clean}/', html_content)
            html_content = re.sub(r'src="/', f'src="{base_url_clean}/', html_content)
            return html_content

        # Add items
        for post in posts:
            item = SubElement(channel, 'item')
            SubElement(item, 'title').text = post['title']
            SubElement(item, 'link').text = f"{base_url_clean}{post['url']}"
            SubElement(item, 'guid', isPermaLink='true').text = f"{base_url_clean}{post['url']}"
            SubElement(item, 'pubDate').text = datetime.strptime(post['date'], '%Y-%m-%d').strftime('%a, %d %b %Y 00:00:00 +0000')

            # Content type based on post type
            if post['type'] == 'long':
                # For long posts, just show a pointer to the full article
                SubElement(item, 'description').text = f"Read more at {base_url_clean}{post['url']}"
            else:
                # For short posts, include full content with absolute URLs
                content_absolute = make_absolute_urls(post['content'])
                SubElement(item, 'description').text = content_absolute

        # Pretty print XML
        xml_str = minidom.parseString(tostring(rss, encoding='utf-8')).toprettyxml(indent=' ', encoding='utf-8')

        # Write to file; reuse the stripped feed_path so pathlib joins under
        # output_dir instead of treating a leading "/" as an absolute path
        output_path = self.output_dir / feed_path
        output_path.parent.mkdir(parents=True, exist_ok=True)
        with open(output_path, 'wb') as f:
            f.write(xml_str)
        print(f"✓ Generated {output_path}")
    def copy_assets(self):
        """Copy theme assets and images to output directory"""
        import shutil

        # Copy theme assets
        if self.assets_dir.exists():
            dest_dir = self.output_dir / 'assets'
            if dest_dir.exists():
                shutil.rmtree(dest_dir)
            shutil.copytree(self.assets_dir, dest_dir)
            print("✓ Copied theme assets to output")

        # Copy images
        images_dir = Path('images')
        if images_dir.exists():
            dest_dir = self.output_dir / 'images'
            if dest_dir.exists():
                shutil.rmtree(dest_dir)
            shutil.copytree(images_dir, dest_dir)
            print("✓ Copied images/ to output")

        # Copy static files (GPG keys, .well-known, etc.)
        static_dir = Path('static')
        if static_dir.exists():
            for item in static_dir.rglob('*'):
                if item.is_file():
                    # Preserve directory structure
                    rel_path = item.relative_to(static_dir)
                    dest_path = self.output_dir / rel_path
                    dest_path.parent.mkdir(parents=True, exist_ok=True)
                    shutil.copy2(item, dest_path)
            print("✓ Copied static/ to output")
    def generate(self):
        """Main generation process"""
        print(f"Starting picopaper generation with theme '{self.theme}'...")

        # Create output directory
        self.output_dir.mkdir(exist_ok=True)

        # Collect posts
        all_posts = self.collect_posts()
        print(f"Found {len(all_posts)} posts")

        # Filter out pages and excluded feeds from main feed
        feed_posts = [p for p in all_posts
                      if p['type'] != 'page'
                      and p['feed'] not in self.exclude_feeds]

        # Generate main index with filtered feed posts
        self.generate_index(feed_posts, all_posts=feed_posts)

        # Group posts by feed (include all posts, not just those in main feed)
        feeds = {}
        for post in all_posts:
            if post['feed'] and post['type'] != 'page':
                feeds.setdefault(post['feed'], []).append(post)

        # Generate feed-specific pages
        for feed_name, posts in feeds.items():
            self.generate_index(posts, feed_name, all_posts=feed_posts)

        # Generate feeds overview page
        if feeds:
            self.generate_feeds_overview(feeds, all_posts=feed_posts)

        # Generate individual pages for long posts, short posts, and pages
        for post in all_posts:
            if post['type'] in ['long', 'short', 'page']:
                self.generate_post_page(post, all_posts=feed_posts)

        # Generate RSS feed
        if ENABLE_RSS_FEED:
            self.generate_rss_feed(feed_posts)

        # Copy assets
        self.copy_assets()
        print(f"\n✓ Site generated successfully in {self.output_dir}/")
def main():
    generator = SSGGGenerator()
    generator.generate()


if __name__ == '__main__':
    main()
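# Typical invocation (illustrative): run `python picopaper.py` from the project
# root; posts are read from items/ and the finished site lands in output/.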