diff --git a/.github/changelog/content-parser b/.github/changelog/content-parser new file mode 100644 index 0000000..5beee3d --- /dev/null +++ b/.github/changelog/content-parser @@ -0,0 +1,4 @@ +Significance: minor +Type: added + +Add rich content support for standard.site documents using the Markpub format. diff --git a/composer.json b/composer.json index d0940c5..887285b 100644 --- a/composer.json +++ b/composer.json @@ -27,8 +27,7 @@ }, "autoload": { "classmap": [ - "includes/", - "integrations/" + "includes/" ] }, "scripts": { diff --git a/includes/class-atmosphere.php b/includes/class-atmosphere.php index 8110424..b0099ed 100644 --- a/includes/class-atmosphere.php +++ b/includes/class-atmosphere.php @@ -13,7 +13,6 @@ use Atmosphere\Transformer\Document; use Atmosphere\Transformer\Publication; use Atmosphere\Transformer\TID; -use Atmosphere\Integrations\Load; use Atmosphere\WP_Admin\Admin; /** @@ -41,8 +40,8 @@ public function init(): void { \add_action( 'init', array( $this, 'register_wellknown_rewrite' ) ); \add_action( 'template_redirect', array( $this, 'serve_wellknown_publication' ) ); - // Plugin integrations. - Load::init(); + // JSON preview for AT Protocol records. + \add_action( 'template_redirect', array( $this, 'preview' ) ); // Post lifecycle hooks. \add_action( 'transition_post_status', array( $this, 'on_status_change' ), 10, 3 ); @@ -145,6 +144,45 @@ public function serve_wellknown_publication(): void { exit; } + /** + * Serve a JSON preview of the AT Protocol record for a post. + * + * Append ?atproto to a singular post URL to see the document + * record JSON. Optionally pass ?atproto={parser} to preview + * with a specific content parser (requires the parser to be + * registered via the atmosphere_content_parser filter). + */ + public function preview(): void { + // phpcs:ignore WordPress.Security.NonceVerification.Recommended + if ( ! isset( $_GET['atproto'] ) || ! \is_singular() ) { + return; + } + + if ( ! \current_user_can( 'edit_posts' ) ) { + return; + } + + $post = \get_queried_object(); + + if ( ! $post instanceof \WP_Post ) { + return; + } + + if ( ! \in_array( $post->post_type, Backfill::syncable_post_types(), true ) ) { + \status_header( 404 ); + exit; + } + + $transformer = new Document( $post ); + $record = $transformer->transform(); + + \status_header( 200 ); + \header( 'Content-Type: application/json; charset=utf-8' ); + // phpcs:ignore WordPress.WP.AlternativeFunctions.json_encode_json_encode + echo \wp_json_encode( $record, JSON_PRETTY_PRINT | JSON_UNESCAPED_SLASHES | JSON_UNESCAPED_UNICODE ); + exit; + } + /** * Handle post status transitions. * diff --git a/includes/content-parser/class-markpub.php b/includes/content-parser/class-markpub.php new file mode 100644 index 0000000..0b31bc6 --- /dev/null +++ b/includes/content-parser/class-markpub.php @@ -0,0 +1,375 @@ + 'at.markpub.markdown', + 'text' => array( + '$type' => 'at.markpub.text', + 'markdown' => $markdown, + ), + 'flavor' => 'gfm', + 'extensions' => array( 'strikethrough', 'table' ), + ); + } + + /** + * Convert a single WordPress block to markdown. + * + * @param array $block Parsed block from parse_blocks(). + * @return string|null Markdown string or null to skip. + */ + private static function transform_block( array $block ): ?string { + if ( empty( $block['blockName'] ) ) { + // Classic (non-block) content or whitespace. + $html = \trim( $block['innerHTML'] ?? '' ); + if ( empty( $html ) ) { + return null; + } + + return self::inline_html_to_markdown( $html ); + } + + return match ( $block['blockName'] ) { + 'core/paragraph' => self::paragraph( $block ), + 'core/heading' => self::heading( $block ), + 'core/image' => self::image( $block ), + 'core/list' => self::listing( $block ), + 'core/quote' => self::quote( $block ), + 'core/code' => self::code( $block ), + 'core/preformatted' => self::preformatted( $block ), + 'core/separator' => '---', + 'core/spacer' => null, + 'core/group', + 'core/columns', + 'core/column' => self::container( $block ), + default => self::fallback( $block ), + }; + } + + /** + * Paragraph block. + * + * @param array $block Parsed block. + * @return string|null + */ + private static function paragraph( array $block ): ?string { + $html = \trim( $block['innerHTML'] ?? '' ); + if ( empty( $html ) ) { + return null; + } + + return self::inline_html_to_markdown( $html ); + } + + /** + * Heading block. + * + * @param array $block Parsed block. + * @return string|null + */ + private static function heading( array $block ): ?string { + $level = $block['attrs']['level'] ?? 2; + $text = self::inline_html_to_markdown( $block['innerHTML'] ?? '' ); + + if ( empty( \trim( $text ) ) ) { + return null; + } + + return \str_repeat( '#', (int) $level ) . ' ' . \trim( $text ); + } + + /** + * Image block. + * + * @param array $block Parsed block. + * @return string|null + */ + private static function image( array $block ): ?string { + $html = $block['innerHTML'] ?? ''; + $src = ''; + $alt = ''; + + $processor = new \WP_HTML_Tag_Processor( $html ); + if ( $processor->next_tag( 'IMG' ) ) { + $src = $processor->get_attribute( 'src' ) ?? ''; + $alt = $processor->get_attribute( 'alt' ) ?? ''; + } + + if ( empty( $src ) ) { + return null; + } + + $md = '![' . $alt . '](' . $src . ')'; + + // Check for a caption in figcaption. + $caption_proc = new \WP_HTML_Tag_Processor( $html ); + if ( $caption_proc->next_tag( 'FIGCAPTION' ) ) { + $caption = \wp_strip_all_tags( + \preg_replace( '#.*]*>#si', '', $html ) + ); + $caption = \trim( \preg_replace( '#.*#si', '', $caption ) ); + + if ( ! empty( $caption ) ) { + $md .= "\n" . $caption; + } + } + + return $md; + } + + /** + * List block. + * + * @param array $block Parsed block. + * @return string|null + */ + private static function listing( array $block ): ?string { + $ordered = ! empty( $block['attrs']['ordered'] ); + $items = array(); + + if ( ! empty( $block['innerBlocks'] ) ) { + foreach ( $block['innerBlocks'] as $i => $inner ) { + $text = self::inline_html_to_markdown( $inner['innerHTML'] ?? '' ); + $text = \trim( $text ); + + if ( empty( $text ) ) { + continue; + } + + $prefix = $ordered ? ( $i + 1 ) . '. ' : '- '; + $items[] = $prefix . $text; + } + } + + return empty( $items ) ? null : \implode( "\n", $items ); + } + + /** + * Quote block. + * + * @param array $block Parsed block. + * @return string|null + */ + private static function quote( array $block ): ?string { + $lines = array(); + + if ( ! empty( $block['innerBlocks'] ) ) { + foreach ( $block['innerBlocks'] as $inner ) { + $md = self::transform_block( $inner ); + if ( null !== $md ) { + $lines[] = $md; + } + } + } + + if ( empty( $lines ) ) { + $text = self::inline_html_to_markdown( $block['innerHTML'] ?? '' ); + if ( empty( \trim( $text ) ) ) { + return null; + } + $lines = array( \trim( $text ) ); + } + + $quoted = \implode( "\n", $lines ); + + // Prefix each line with >. + return \implode( + "\n", + \array_map( + static fn( $line ) => '> ' . $line, + \explode( "\n", $quoted ) + ) + ); + } + + /** + * Code block. + * + * @param array $block Parsed block. + * @return string|null + */ + private static function code( array $block ): ?string { + $text = \wp_strip_all_tags( $block['innerHTML'] ?? '' ); + $text = \html_entity_decode( $text, ENT_QUOTES, 'UTF-8' ); + $text = \trim( $text ); + + if ( empty( $text ) ) { + return null; + } + + $lang = $block['attrs']['language'] ?? ''; + + return '```' . $lang . "\n" . $text . "\n```"; + } + + /** + * Preformatted block. + * + * @param array $block Parsed block. + * @return string|null + */ + private static function preformatted( array $block ): ?string { + return self::code( $block ); + } + + /** + * Container block — flatten inner blocks. + * + * @param array $block Parsed block. + * @return string|null + */ + private static function container( array $block ): ?string { + if ( empty( $block['innerBlocks'] ) ) { + return null; + } + + $parts = array(); + + foreach ( $block['innerBlocks'] as $inner ) { + $md = self::transform_block( $inner ); + if ( null !== $md ) { + $parts[] = $md; + } + } + + return empty( $parts ) ? null : \implode( "\n\n", $parts ); + } + + /** + * Fallback for unknown block types. + * + * @param array $block Parsed block. + * @return string|null + */ + private static function fallback( array $block ): ?string { + if ( ! empty( $block['innerBlocks'] ) ) { + return self::container( $block ); + } + + $html = \trim( $block['innerHTML'] ?? '' ); + if ( empty( $html ) ) { + return null; + } + + return self::inline_html_to_markdown( $html ); + } + + /** + * Convert inline HTML formatting to markdown. + * + * Handles links, bold, italic, strikethrough, inline code, + * images, and line breaks. Strips block-level wrappers and + * remaining tags. + * + * @param string $html HTML string. + * @return string Markdown string. + */ + private static function inline_html_to_markdown( string $html ): string { + $html = \trim( $html ); + + if ( empty( $html ) ) { + return ''; + } + + $md = $html; + + // Inline images. + $md = \preg_replace_callback( + '#]+>#si', + static function ( $m ) { + $processor = new \WP_HTML_Tag_Processor( $m[0] ); + if ( $processor->next_tag( 'IMG' ) ) { + $src = $processor->get_attribute( 'src' ) ?? ''; + $alt = $processor->get_attribute( 'alt' ) ?? ''; + return '![' . $alt . '](' . $src . ')'; + } + return ''; + }, + $md + ); + + // Links. + $md = \preg_replace_callback( + '#]+href=["\']([^"\']*)["\'][^>]*>(.*?)#si', + static fn( $m ) => '[' . \wp_strip_all_tags( $m[2] ) . '](' . $m[1] . ')', + $md + ); + + // Bold. + $md = \preg_replace( '#<(?:strong|b)>(.*?)#si', '**$1**', $md ); + + // Italic. + $md = \preg_replace( '#<(?:em|i)>(.*?)#si', '*$1*', $md ); + + // Strikethrough. + $md = \preg_replace( '#<(?:s|del|strike)>(.*?)#si', '~~$1~~', $md ); + + // Inline code. + $md = \preg_replace( '#(.*?)#si', '`$1`', $md ); + + // Line breaks. + $md = \preg_replace( '##si', " \n", $md ); + + // Strip block-level wrappers and remaining tags. + $md = \wp_strip_all_tags( $md ); + + // Decode HTML entities. + $md = \html_entity_decode( $md, ENT_QUOTES, 'UTF-8' ); + + return \trim( $md ); + } +} diff --git a/includes/content-parser/interface-content-parser.php b/includes/content-parser/interface-content-parser.php new file mode 100644 index 0000000..4f11827 --- /dev/null +++ b/includes/content-parser/interface-content-parser.php @@ -0,0 +1,42 @@ + $this->to_iso8601( $this->object->post_date_gmt ), ); - // Publication reference. + // Publication reference (required by spec). $pub_tid = \get_option( 'atmosphere_publication_tid' ); if ( $pub_tid ) { $record['site'] = build_at_uri( get_did(), 'site.standard.publication', $pub_tid ); + } else { + // Fall back to site URL for standalone documents. + $record['site'] = \untrailingslashit( \get_home_url() ); } // Relative path. @@ -89,6 +94,12 @@ public function transform(): array { $record['textContent'] = $text_content; } + // Parsed rich content (open union). + $content = $this->get_content(); + if ( ! empty( $content ) ) { + $record['content'] = $content; + } + // Tags. $tags = $this->collect_tags( $this->object ); if ( ! empty( $tags ) ) { @@ -140,6 +151,43 @@ public function get_rkey(): string { return $rkey; } + /** + * Get parsed content for the document's content union field. + * + * @return array|null Parsed content object or null. + */ + private function get_content(): ?array { + if ( empty( \trim( $this->object->post_content ) ) ) { + return null; + } + + /** + * Filters the content parser used for site.standard.document records. + * + * Return a Content_Parser instance to override the default parser. + * Return null to disable the content field entirely. + * + * @param Content_Parser|null $parser The content parser. Default: Markpub. + * @param \WP_Post $post The WordPress post. + */ + $parser = \apply_filters( 'atmosphere_content_parser', new Markpub(), $this->object ); + + if ( ! $parser instanceof Content_Parser ) { + return null; + } + + $content = $parser->parse( $this->object->post_content, $this->object ); + + /** + * Filters the parsed content object before adding to the document record. + * + * @param array $content The parsed content object. + * @param \WP_Post $post The WordPress post. + * @param Content_Parser $parser The parser that produced the content. + */ + return \apply_filters( 'atmosphere_document_content', $content, $this->object, $parser ); + } + /** * Render post content to plain text. * diff --git a/tests/phpunit/tests/content-parser/class-test-markpub.php b/tests/phpunit/tests/content-parser/class-test-markpub.php new file mode 100644 index 0000000..99409ec --- /dev/null +++ b/tests/phpunit/tests/content-parser/class-test-markpub.php @@ -0,0 +1,234 @@ +parser = new Markpub(); + } + + /** + * Test get_type returns the markpub NSID. + */ + public function test_get_type() { + $this->assertSame( 'at.markpub.markdown', $this->parser->get_type() ); + } + + /** + * Test parse returns correct top-level structure. + */ + public function test_parse_returns_correct_structure() { + $post = self::factory()->post->create_and_get(); + $result = $this->parser->parse( + '

Hello world

', + $post + ); + + $this->assertArrayHasKey( '$type', $result ); + $this->assertSame( 'at.markpub.markdown', $result['$type'] ); + $this->assertArrayHasKey( 'text', $result ); + $this->assertSame( 'at.markpub.text', $result['text']['$type'] ); + $this->assertArrayHasKey( 'markdown', $result['text'] ); + $this->assertSame( 'gfm', $result['flavor'] ); + $this->assertContains( 'strikethrough', $result['extensions'] ); + } + + /** + * Test paragraph conversion. + */ + public function test_converts_paragraphs() { + $post = self::factory()->post->create_and_get(); + $content = "\n

First paragraph

\n\n\n" + . "\n

Second paragraph

\n"; + + $result = $this->parser->parse( $content, $post ); + $markdown = $result['text']['markdown']; + + $this->assertStringContainsString( 'First paragraph', $markdown ); + $this->assertStringContainsString( 'Second paragraph', $markdown ); + $this->assertStringNotContainsString( '

', $markdown ); + } + + /** + * Test heading conversion. + */ + public function test_converts_headings() { + $post = self::factory()->post->create_and_get(); + $content = '

My Heading

'; + $result = $this->parser->parse( $content, $post ); + + $this->assertStringContainsString( '## My Heading', $result['text']['markdown'] ); + } + + /** + * Test heading level 3. + */ + public function test_converts_heading_level_3() { + $post = self::factory()->post->create_and_get(); + $content = '

Sub Heading

'; + $result = $this->parser->parse( $content, $post ); + + $this->assertStringContainsString( '### Sub Heading', $result['text']['markdown'] ); + } + + /** + * Test link conversion in a paragraph. + */ + public function test_converts_links() { + $post = self::factory()->post->create_and_get(); + $content = '

Visit Example today.

'; + $result = $this->parser->parse( $content, $post ); + + $this->assertStringContainsString( '[Example](https://example.com)', $result['text']['markdown'] ); + } + + /** + * Test bold conversion. + */ + public function test_converts_bold() { + $post = self::factory()->post->create_and_get(); + $content = '

This is bold text.

'; + $result = $this->parser->parse( $content, $post ); + + $this->assertStringContainsString( '**bold**', $result['text']['markdown'] ); + } + + /** + * Test italic conversion. + */ + public function test_converts_italic() { + $post = self::factory()->post->create_and_get(); + $content = '

This is italic text.

'; + $result = $this->parser->parse( $content, $post ); + + $this->assertStringContainsString( '*italic*', $result['text']['markdown'] ); + } + + /** + * Test image block conversion. + */ + public function test_converts_images() { + $post = self::factory()->post->create_and_get(); + $content = '
A photo
'; + $result = $this->parser->parse( $content, $post ); + + $this->assertStringContainsString( '![A photo](https://example.com/photo.jpg)', $result['text']['markdown'] ); + } + + /** + * Test code block conversion. + */ + public function test_converts_code_blocks() { + $post = self::factory()->post->create_and_get(); + $content = '
echo "hello";
'; + $result = $this->parser->parse( $content, $post ); + $md = $result['text']['markdown']; + + $this->assertStringContainsString( '```', $md ); + $this->assertStringContainsString( 'echo "hello";', $md ); + } + + /** + * Test inline code conversion. + */ + public function test_converts_inline_code() { + $post = self::factory()->post->create_and_get(); + $content = '

Use the parse() method.

'; + $result = $this->parser->parse( $content, $post ); + + $this->assertStringContainsString( '`parse()`', $result['text']['markdown'] ); + } + + /** + * Test separator block becomes horizontal rule. + */ + public function test_converts_separator() { + $post = self::factory()->post->create_and_get(); + $content = "

Before

\n\n" + . "
\n\n" + . '

After

'; + $result = $this->parser->parse( $content, $post ); + + $this->assertStringContainsString( '---', $result['text']['markdown'] ); + } + + /** + * Test empty content produces empty markdown. + */ + public function test_empty_content() { + $post = self::factory()->post->create_and_get(); + $result = $this->parser->parse( '', $post ); + + $this->assertSame( '', $result['text']['markdown'] ); + } + + /** + * Test the atmosphere_html_to_markdown filter. + */ + public function test_html_to_markdown_filter() { + \add_filter( + 'atmosphere_html_to_markdown', + static fn() => 'custom markdown', + 10, + 2 + ); + + $post = self::factory()->post->create_and_get(); + $result = $this->parser->parse( + '

Hello

', + $post + ); + + $this->assertSame( 'custom markdown', $result['text']['markdown'] ); + + \remove_all_filters( 'atmosphere_html_to_markdown' ); + } + + /** + * Test strikethrough conversion. + */ + public function test_converts_strikethrough() { + $post = self::factory()->post->create_and_get(); + $content = '

This is deleted text.

'; + $result = $this->parser->parse( $content, $post ); + + $this->assertStringContainsString( '~~deleted~~', $result['text']['markdown'] ); + } + + /** + * Test classic (non-block) content is handled as fallback. + */ + public function test_classic_content_fallback() { + $post = self::factory()->post->create_and_get(); + $result = $this->parser->parse( '

Classic editor content with bold.

', $post ); + $md = $result['text']['markdown']; + + $this->assertStringContainsString( '**bold**', $md ); + $this->assertStringContainsString( 'Classic editor content', $md ); + } +}