Skip to content

Commit 1f0f708

Browse files
committed
[Data Liberation] Add Epub importer
Description TBD. Use WP_XML_Reader for EPubs; support simple DOCTYPE declarations in XML. Parse EPubs as XHTML.
1 parent d400031 commit 1f0f708

File tree

10 files changed

+350
-42
lines changed

10 files changed

+350
-42
lines changed

packages/playground/data-liberation/bootstrap.php

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,7 @@
6161

6262
require_once __DIR__ . '/src/entity-readers/WP_Entity_Reader.php';
6363
require_once __DIR__ . '/src/entity-readers/WP_HTML_Entity_Reader.php';
64+
require_once __DIR__ . '/src/entity-readers/WP_EPub_Entity_Reader.php';
6465
require_once __DIR__ . '/src/entity-readers/WP_WXR_Entity_Reader.php';
6566
require_once __DIR__ . '/src/entity-readers/WP_Directory_Tree_Entity_Reader.php';
6667

packages/playground/data-liberation/src/block-markup/WP_HTML_To_Blocks.php

Lines changed: 21 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -28,14 +28,15 @@ class WP_HTML_To_Blocks implements WP_Block_Markup_Converter {
2828

2929
private $state = self::STATE_READY;
3030
private $block_stack = array();
31-
private $html;
31+
private $markup_processor;
3232
private $ignore_text = false;
3333
private $in_ephemeral_paragraph = false;
3434
private $block_markup = '';
3535
private $metadata = array();
36+
private $last_error = null;
3637

3738
public function __construct( $html ) {
38-
$this->html = WP_HTML_Processor::create_fragment( $html );
39+
$this->markup_processor = WP_HTML_Processor::create_fragment( $html );
3940
}
4041

4142
/**
@@ -46,21 +47,27 @@ public function convert() {
4647
return false;
4748
}
4849

49-
while ( $this->html->next_token() ) {
50-
switch ( $this->html->get_token_type() ) {
50+
while ( $this->markup_processor->next_token() ) {
51+
switch ( $this->markup_processor->get_token_type() ) {
5152
case '#text':
5253
if ( $this->ignore_text ) {
5354
break;
5455
}
55-
$this->append_html( htmlspecialchars( $this->html->get_modifiable_text() ) );
56+
$this->append_html( htmlspecialchars( $this->markup_processor->get_modifiable_text() ) );
5657
break;
5758
case '#tag':
5859
$this->handle_tag();
5960
break;
6061
}
6162
}
6263

64+
if ( $this->markup_processor->get_last_error() ) {
65+
$this->last_error = $this->markup_processor->get_last_error();
66+
return false;
67+
}
68+
6369
$this->close_ephemeral_paragraph();
70+
6471
return true;
6572
}
6673

@@ -93,8 +100,8 @@ public function get_block_markup() {
93100
* or metadata.
94101
*/
95102
private function handle_tag() {
96-
$html = $this->html;
97-
$tag = $html->get_tag();
103+
$html = $this->markup_processor;
104+
$tag = strtoupper( $html->get_tag() );
98105
$tag_lowercase = strtolower( $tag );
99106

100107
$is_opener = ! $html->is_tag_closer() && $html->expects_closer();
@@ -130,9 +137,9 @@ private function handle_tag() {
130137
$this->push_block( 'html' );
131138
$template = new \WP_HTML_Tag_Processor( '<input>' );
132139
$template->next_tag();
133-
$attrs = $this->html->get_attribute_names_with_prefix( '' );
140+
$attrs = $this->markup_processor->get_attribute_names_with_prefix( '' );
134141
foreach ( $attrs as $attr ) {
135-
$template->set_attribute( $attr, $this->html->get_attribute( $attr ) );
142+
$template->set_attribute( $attr, $this->markup_processor->get_attribute( $attr ) );
136143
}
137144
$this->append_html( htmlspecialchars( $template->get_updated_html() ) );
138145
$this->pop_block();
@@ -337,7 +344,7 @@ private function should_preserve_tag_in_rich_text( $tag ) {
337344
}
338345

339346
private function is_at_inline_code_element() {
340-
$breadcrumbs = $this->html->get_breadcrumbs();
347+
$breadcrumbs = $this->markup_processor->get_breadcrumbs();
341348
foreach ( $breadcrumbs as $tag ) {
342349
switch ( $tag ) {
343350
case 'A':
@@ -425,4 +432,8 @@ private function close_ephemeral_paragraph() {
425432
$this->in_ephemeral_paragraph = false;
426433
}
427434
}
435+
436+
public function get_last_error() {
437+
return $this->last_error;
438+
}
428439
}
Lines changed: 123 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,123 @@
1+
<?php
2+
3+
use WordPress\Zip\WP_Zip_Filesystem;
4+
5+
/**
6+
* https://www.w3.org/AudioVideo/ebook/
7+
*
8+
* An EPUB Publication is transported as a single file (a "portable document") that contains:
9+
* * a Package Document (OPF file) which specifies all the Publication's constituent content documents and their required resources, defines a reading order and associates Publication-level metadata and navigation information.
10+
* * A metadata element including and/or referencing metadata applicable to the entire Publication and particular resources within it.
11+
* * A manifest element: identifies (via IRI) and describes (via MIME media type) the set of resources that constitute the EPUB Publication.
12+
* * A spine element: defines the default reading order of the Publication (an ordered list of Publication Resources, i.e. EPUB Content Documents).
13+
* * A bindings element: defines a set of custom handlers for media types not supported by EPUB3. If the Reading System cannot support the specific media type, it could use a scripting fallback if supported.
14+
* * all Content Documents
15+
* * all other required resources for processing the Publication.
16+
*
17+
* The OCF Container is packaged into a physical single ZIP file containing:
18+
* * Mime Type file: application/epub+zip.
19+
* * META-INF folder: the container file (which points to the location of the .opf file), plus signatures, encryption, and rights files. All of these are XML files.
20+
* * OEBPS folder: stores the book content (opf, ncx, html, svg, png, css, etc. files).
21+
*/
22+
class WP_EPub_Entity_Reader extends WP_Entity_Reader {
23+
24+
protected $zip;
25+
protected $finished = false;
26+
protected $current_post_id;
27+
protected $remaining_html_files;
28+
protected $current_html_reader;
29+
protected $last_error;
30+
public function __construct( WP_Zip_Filesystem $zip, $first_post_id = 1 ) {
31+
$this->zip = $zip;
32+
$this->current_post_id = $first_post_id;
33+
}
34+
35+
public function next_entity() {
36+
if ( $this->last_error ) {
37+
return false;
38+
}
39+
40+
if ( $this->finished ) {
41+
return false;
42+
}
43+
44+
if ( null === $this->remaining_html_files ) {
45+
$path = false;
46+
foreach ( array( '/OEBPS', '/EPUB' ) as $path_candidate ) {
47+
if ( $this->zip->is_dir( $path_candidate ) ) {
48+
$path = $path_candidate;
49+
break;
50+
}
51+
}
52+
if ( false === $path ) {
53+
_doing_it_wrong( __METHOD__, 'The EPUB file did not contain any HTML files.', '1.0.0' );
54+
$this->finished = true;
55+
return false;
56+
}
57+
58+
$files = $this->zip->ls( $path );
59+
if ( false === $files ) {
60+
_doing_it_wrong( __METHOD__, 'The EPUB file did not contain any HTML files.', '1.0.0' );
61+
$this->finished = true;
62+
return false;
63+
}
64+
$this->remaining_html_files = array();
65+
foreach ( $files as $file ) {
66+
if ( str_ends_with( $file, '.xhtml' ) || str_ends_with( $file, '.html' ) ) {
67+
$this->remaining_html_files[] = $path . '/' . $file;
68+
}
69+
}
70+
}
71+
72+
while ( true ) {
73+
if ( null !== $this->current_html_reader ) {
74+
if (
75+
! $this->current_html_reader->is_finished() &&
76+
$this->current_html_reader->next_entity()
77+
) {
78+
return true;
79+
}
80+
if ( $this->current_html_reader->get_last_error() ) {
81+
_doing_it_wrong(
82+
__METHOD__,
83+
'The EPUB file did not contain any HTML files.',
84+
'1.0.0'
85+
);
86+
$this->finished = true;
87+
return false;
88+
}
89+
}
90+
91+
if ( count( $this->remaining_html_files ) === 0 ) {
92+
$this->finished = true;
93+
return false;
94+
}
95+
96+
$html_file = array_shift( $this->remaining_html_files );
97+
$html = $this->zip->read_file( $html_file );
98+
$this->current_html_reader = new WP_HTML_Entity_Reader(
99+
WP_XML_Processor::create_from_string( $html ),
100+
$this->current_post_id
101+
);
102+
if ( $this->current_html_reader->get_last_error() ) {
103+
$this->last_error = $this->current_html_reader->get_last_error();
104+
return false;
105+
}
106+
++$this->current_post_id;
107+
}
108+
109+
return false;
110+
}
111+
112+
public function get_entity() {
113+
return $this->current_html_reader->get_entity();
114+
}
115+
116+
public function is_finished(): bool {
117+
return $this->finished;
118+
}
119+
120+
public function get_last_error(): ?string {
121+
return $this->last_error;
122+
}
123+
}

packages/playground/data-liberation/src/entity-readers/WP_HTML_Entity_Reader.php

Lines changed: 8 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -5,18 +5,7 @@
55
*/
66
class WP_HTML_Entity_Reader extends WP_Entity_Reader {
77

8-
/**
9-
* The HTML document to convert.
10-
*
11-
* @var string
12-
*/
13-
protected $html;
14-
15-
/**
16-
* The emitted entities.
17-
*
18-
* @var array
19-
*/
8+
protected $html_processor;
209
protected $entities;
2110

2211
/**
@@ -32,16 +21,11 @@ class WP_HTML_Entity_Reader extends WP_Entity_Reader {
3221
* @var int
3322
*/
3423
protected $post_id;
24+
protected $last_error;
3525

36-
/**
37-
* Constructs the reader.
38-
*
39-
* @param string $html The HTML document to convert.
40-
* @param int $post_id The ID to use as `post_id` of the emitted post entity.
41-
*/
42-
public function __construct( $html, $post_id ) {
43-
$this->html = $html;
44-
$this->post_id = $post_id;
26+
public function __construct( $html_processor, $post_id ) {
27+
$this->html_processor = $html_processor;
28+
$this->post_id = $post_id;
4529
}
4630

4731
/**
@@ -66,8 +50,9 @@ public function next_entity() {
6650
}
6751

6852
// We did not read any entities yet. Let's convert the HTML document into entities.
69-
$converter = new WP_HTML_To_Blocks( $this->html );
53+
$converter = new WP_HTML_To_Blocks( $this->html_processor );
7054
if ( false === $converter->convert() ) {
55+
$this->last_error = $converter->get_last_error();
7156
return false;
7257
}
7358

@@ -135,6 +120,6 @@ public function is_finished(): bool {
135120
* @return string|null The last error, or null if there was no error.
136121
*/
137122
public function get_last_error(): ?string {
138-
return null;
123+
return $this->last_error;
139124
}
140125
}

0 commit comments

Comments
 (0)