#!/usr/bin/env python3
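"""picopaper — a minimal static site generator.

Reads markdown posts from an items/ directory (filenames encode date, post
type, slug, and an optional feed), renders them through Jinja2 templates from
a theme, and writes a static site plus an optional RSS 2.0 feed to output/.
All site-wide settings (titles, theme, feed options, author info) come from a
local config.py; the `from config import ...` list below shows what it must
define.
"""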
import re
from datetime import datetime, timezone
from pathlib import Path

from jinja2 import Environment, FileSystemLoader
import markdown

from config import (BLOG_TITLE, BLOG_DESCRIPTION, THEME, EXCLUDE_FEEDS_FROM_MAIN,
                    NAVBAR_ITEMS, HIDE_LOGO, HIDE_TITLE, LOGO_PATH,
                    ENABLE_RSS_FEED, RSS_FEED_PATH,
                    BASE_URL, AUTHOR_NAME, AUTHOR_EMAIL, FEED_MAX_ITEMS)
class SSGGGenerator:
    def __init__(self, items_dir='items', output_dir='output', theme=None,
                 blog_title=None, blog_description=None):
        self.items_dir = Path(items_dir)
        self.output_dir = Path(output_dir)
        self.theme = theme or THEME
        self.theme_dir = Path('theme') / self.theme
        self.templates_dir = self.theme_dir / 'templates'
        self.assets_dir = self.theme_dir / 'assets'
        self.blog_title = blog_title or BLOG_TITLE
        self.blog_description = blog_description or BLOG_DESCRIPTION
        self.exclude_feeds = EXCLUDE_FEEDS_FROM_MAIN
        self.navbar_items = NAVBAR_ITEMS
        self.hide_logo = HIDE_LOGO
        self.hide_title = HIDE_TITLE
        self.logo_path = LOGO_PATH

        # Setup Jinja2
        self.env = Environment(loader=FileSystemLoader(self.templates_dir))

        # Add custom filter for random sampling
        def random_sample(items, count):
            import random
            items_list = list(items)
            return random.sample(items_list, min(count, len(items_list)))
        self.env.filters['random_sample'] = random_sample

        # Setup markdown with toc extension for header anchors
        self.md = markdown.Markdown(extensions=['extra', 'toc'])
    def parse_filename(self, filename):
        """Parse filename format: YYYY-MM-DD_type_name[_feed].md"""
        # Anchored with $ so trailing junk after ".md" can't slip through
        pattern = r'(\d{4}-\d{2}-\d{2})_(short|long|page)_(.+?)(?:_([a-z0-9-]+))?\.md$'
        match = re.match(pattern, filename)
        if not match:
            return None
        date_str, post_type, name, feed = match.groups()
        date = datetime.strptime(date_str, '%Y-%m-%d')
        return {
            'date': date,
            'date_str': date.strftime('%Y-%m-%d'),
            'type': post_type,
            'name': name,
            'feed': feed,
            'filename': filename
        }
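    # Example (illustrative): "2024-03-01_long_hello-world_devlog.md" parses to
    # date 2024-03-01, type "long", name "hello-world", feed "devlog"; the
    # trailing feed segment is optional.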
    def add_header_anchors(self, html_content):
        """Add anchor links to headers with IDs"""
        # Pattern to match headers with id attributes: <h2 id="some-id">Text</h2>
        def replace_header(match):
            tag = match.group(1)
            header_id = match.group(2)
            text = match.group(3)
            # Add anchor link with # symbol
            return f'<{tag} id="{header_id}">{text} <a href="#{header_id}" class="header-anchor">#</a></{tag}>'

        # Match h2-h6 tags with id attributes; headers containing inline HTML
        # (e.g. <code>) are left untouched because [^<]+ won't match them
        pattern = r'<(h[2-6]) id="([^"]+)">([^<]+)</\1>'
        return re.sub(pattern, replace_header, html_content)
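    # For example, '<h2 id="setup">Setup</h2>' becomes
    # '<h2 id="setup">Setup <a href="#setup" class="header-anchor">#</a></h2>'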
    def read_post(self, filepath):
        """Read markdown file and extract title and content"""
        with open(filepath, 'r', encoding='utf-8') as f:
            content = f.read()

        # Extract title (a leading "# " heading at the top of the file)
        title_match = re.match(r'^#\s+(.+)$', content, re.MULTILINE)
        title = title_match.group(1) if title_match else 'Untitled'

        # Remove title from content
        if title_match:
            content = content[title_match.end():].strip()

        # Reset the shared Markdown instance so per-document state (e.g. the
        # toc extension's generated header IDs) doesn't leak between posts
        self.md.reset()

        # Convert markdown to HTML
        html_content = self.md.convert(content)

        # Add anchor links to headers
        html_content = self.add_header_anchors(html_content)
        return title, html_content
    def collect_posts(self):
        """Collect and parse all posts from items directory"""
        posts = []
        if not self.items_dir.exists():
            print(f"Warning: {self.items_dir} does not exist")
            return posts

        for filepath in self.items_dir.glob('*.md'):
            parsed = self.parse_filename(filepath.name)
            if not parsed:
                print(f"Skipping {filepath.name}: doesn't match naming convention")
                continue
            title, content = self.read_post(filepath)
            post = {
                'date': parsed['date_str'],
                'type': parsed['type'],
                'name': parsed['name'],
                'title': title,
                'content': content,
                'slug': parsed['name'],
                'url': f"/{parsed['name']}/",
                'feed': parsed['feed'],
                'source': filepath.name
            }
            posts.append(post)

        # Sort by date, newest first (ISO date strings sort lexicographically)
        posts.sort(key=lambda x: x['date'], reverse=True)
        return posts
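    # Each post dict carries: date (YYYY-MM-DD string), type (short/long/page),
    # name/slug, title, rendered HTML content, url ("/<slug>/"), the optional
    # feed, and the source filename.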
    def generate_index(self, posts, feed_name=None, all_posts=None):
        """Generate index.html with all posts (or feed-specific index)"""
        template = self.env.get_template('index.tmpl')

        if feed_name:
            title = f"{feed_name} - {self.blog_title}"
            output_path = self.output_dir / 'feed' / feed_name / 'index.html'
        else:
            title = self.blog_title
            output_path = self.output_dir / 'index.html'

        html = template.render(
            title=title,
            blog_title=self.blog_title,
            blog_description=self.blog_description,
            navbar_items=self.navbar_items,
            posts=posts,
            all_posts=all_posts or posts,
            hide_logo=self.hide_logo,
            hide_title=self.hide_title,
            logo_path=self.logo_path,
            rss_feed_enabled=ENABLE_RSS_FEED,
            rss_feed_path=RSS_FEED_PATH
        )

        output_path.parent.mkdir(parents=True, exist_ok=True)
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(html)
        print(f"✓ Generated {output_path}")
    def generate_feeds_overview(self, feeds, all_posts=None):
        """Generate /feed/index.html with list of all non-excluded feeds"""
        template = self.env.get_template('feeds.tmpl')

        # Prepare feed data with counts, excluding feeds in EXCLUDE_FEEDS_FROM_MAIN
        feed_list = []
        for feed_name, posts in sorted(feeds.items()):
            if feed_name not in self.exclude_feeds:
                feed_list.append({
                    'name': feed_name,
                    'count': len(posts)
                })

        title = f"Feeds - {self.blog_title}"
        output_path = self.output_dir / 'feed' / 'index.html'
        html = template.render(
            title=title,
            blog_title=self.blog_title,
            blog_description=self.blog_description,
            navbar_items=self.navbar_items,
            feeds=feed_list,
            all_posts=all_posts or [],
            hide_logo=self.hide_logo,
            hide_title=self.hide_title,
            logo_path=self.logo_path,
            rss_feed_enabled=ENABLE_RSS_FEED,
            rss_feed_path=RSS_FEED_PATH
        )

        output_path.parent.mkdir(parents=True, exist_ok=True)
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(html)
        print(f"✓ Generated {output_path}")
    def generate_post_page(self, post, all_posts=None):
        """Generate an individual page for a post (long, short, or page)"""
        template = self.env.get_template('post.tmpl')
        html = template.render(
            title=f"{post['title']} - {self.blog_title}",
            blog_title=self.blog_title,
            blog_description=self.blog_description,
            navbar_items=self.navbar_items,
            post=post,
            all_posts=all_posts or [],
            hide_logo=self.hide_logo,
            hide_title=self.hide_title,
            logo_path=self.logo_path,
            rss_feed_enabled=ENABLE_RSS_FEED,
            rss_feed_path=RSS_FEED_PATH
        )

        # Create directory for the post slug
        post_dir = self.output_dir / post['slug']
        post_dir.mkdir(parents=True, exist_ok=True)

        # Generate index.html inside the slug directory
        output_path = post_dir / 'index.html'
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(html)
        print(f"✓ Generated {output_path}")
    def generate_rss_feed(self, posts):
        """Generate RSS 2.0 feed for main feed posts"""
        from xml.etree.ElementTree import Element, SubElement, tostring, register_namespace
        from xml.dom import minidom

        # Register atom namespace to avoid ns0 prefix
        register_namespace('atom', 'http://www.w3.org/2005/Atom')

        # Limit posts
        posts = posts[:FEED_MAX_ITEMS]

        # Build feed URL correctly - ensure no double slashes
        feed_path = RSS_FEED_PATH.lstrip('/')
        # Remove trailing slash from BASE_URL if present for clean URL construction
        base_url_clean = BASE_URL.rstrip('/')
        feed_url = f"{base_url_clean}/{feed_path}"

        # Create RSS element (namespace will be added automatically when we use atom:link)
        rss = Element('rss', version='2.0')
        channel = SubElement(rss, 'channel')

        # Channel metadata
        SubElement(channel, 'title').text = self.blog_title
        SubElement(channel, 'description').text = self.blog_description
        SubElement(channel, 'link').text = base_url_clean

        # Add atom:link with rel="self" (required by RSS best practices)
        atom_link = SubElement(channel, '{http://www.w3.org/2005/Atom}link')
        atom_link.set('href', feed_url)
        atom_link.set('rel', 'self')
        atom_link.set('type', 'application/rss+xml')

        # datetime.utcnow() is deprecated; use an explicit UTC-aware timestamp
        SubElement(channel, 'lastBuildDate').text = datetime.now(timezone.utc).strftime('%a, %d %b %Y %H:%M:%S +0000')

        # Add author information (managingEditor format: email (name))
        if AUTHOR_EMAIL and AUTHOR_NAME:
            SubElement(channel, 'managingEditor').text = f"{AUTHOR_EMAIL} ({AUTHOR_NAME})"
        elif AUTHOR_EMAIL:
            SubElement(channel, 'managingEditor').text = AUTHOR_EMAIL

        # Helper function to convert root-relative URLs to absolute
        def make_absolute_urls(html_content):
            html_content = re.sub(r'href="/', f'href="{base_url_clean}/', html_content)
            html_content = re.sub(r'src="/', f'src="{base_url_clean}/', html_content)
            return html_content

        # Add items
        for post in posts:
            item = SubElement(channel, 'item')
            SubElement(item, 'title').text = post['title']
            SubElement(item, 'link').text = f"{base_url_clean}{post['url']}"
            SubElement(item, 'guid', isPermaLink='true').text = f"{base_url_clean}{post['url']}"
            SubElement(item, 'pubDate').text = datetime.strptime(post['date'], '%Y-%m-%d').strftime('%a, %d %b %Y 00:00:00 +0000')

            # Content type based on post type
            if post['type'] == 'long':
                # For long posts, just show a pointer to the full article
                SubElement(item, 'description').text = f"Read more at {base_url_clean}{post['url']}"
            else:
                # For short posts, include full content with absolute URLs
                content_absolute = make_absolute_urls(post['content'])
                SubElement(item, 'description').text = content_absolute

        # Pretty print XML
        xml_str = minidom.parseString(tostring(rss, encoding='utf-8')).toprettyxml(indent=' ', encoding='utf-8')

        # Write to file; reuse the stripped feed_path so pathlib joins under
        # output_dir instead of treating a leading "/" as an absolute path
        output_path = self.output_dir / feed_path
        output_path.parent.mkdir(parents=True, exist_ok=True)
        with open(output_path, 'wb') as f:
            f.write(xml_str)
        print(f"✓ Generated {output_path}")
    def copy_assets(self):
        """Copy theme assets and images to output directory"""
        import shutil

        # Copy theme assets
        if self.assets_dir.exists():
            dest_dir = self.output_dir / 'assets'
            if dest_dir.exists():
                shutil.rmtree(dest_dir)
            shutil.copytree(self.assets_dir, dest_dir)
            print("✓ Copied theme assets to output")

        # Copy images
        images_dir = Path('images')
        if images_dir.exists():
            dest_dir = self.output_dir / 'images'
            if dest_dir.exists():
                shutil.rmtree(dest_dir)
            shutil.copytree(images_dir, dest_dir)
            print("✓ Copied images/ to output")

        # Copy static files (GPG keys, .well-known, etc.)
        static_dir = Path('static')
        if static_dir.exists():
            for item in static_dir.rglob('*'):
                if item.is_file():
                    # Preserve directory structure
                    rel_path = item.relative_to(static_dir)
                    dest_path = self.output_dir / rel_path
                    dest_path.parent.mkdir(parents=True, exist_ok=True)
                    shutil.copy2(item, dest_path)
            print("✓ Copied static/ to output")
    def generate(self):
        """Main generation process"""
        print(f"Starting picopaper generation with theme '{self.theme}'...")

        # Create output directory
        self.output_dir.mkdir(exist_ok=True)

        # Collect posts
        all_posts = self.collect_posts()
        print(f"Found {len(all_posts)} posts")

        # Filter out pages and excluded feeds from main feed
        feed_posts = [p for p in all_posts
                      if p['type'] != 'page'
                      and p['feed'] not in self.exclude_feeds]

        # Generate main index with filtered feed posts
        self.generate_index(feed_posts, all_posts=feed_posts)

        # Group posts by feed (include all posts, not just those in main feed)
        feeds = {}
        for post in all_posts:
            if post['feed'] and post['type'] != 'page':
                feeds.setdefault(post['feed'], []).append(post)

        # Generate feed-specific pages
        for feed_name, posts in feeds.items():
            self.generate_index(posts, feed_name, all_posts=feed_posts)

        # Generate feeds overview page
        if feeds:
            self.generate_feeds_overview(feeds, all_posts=feed_posts)

        # Generate individual pages for long posts, short posts, and pages
        for post in all_posts:
            if post['type'] in ['long', 'short', 'page']:
                self.generate_post_page(post, all_posts=feed_posts)

        # Generate RSS feed
        if ENABLE_RSS_FEED:
            self.generate_rss_feed(feed_posts)

        # Copy assets
        self.copy_assets()
        print(f"\n✓ Site generated successfully in {self.output_dir}/")
def main():
    generator = SSGGGenerator()
    generator.generate()


if __name__ == '__main__':
    main()
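# Typical invocation (illustrative): run `python picopaper.py` from the project
# root; posts are read from items/ and the finished site lands in output/.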