Skip to main content

cadmus_core/document/html/
dom.rs

1use fxhash::{FxHashMap, FxHashSet};
2use std::num::NonZeroUsize;
3
4pub type Attributes = FxHashMap<String, String>;
5pub const WRAPPER_TAG_NAME: &str = "anonymous";
6
7#[derive(Debug, Clone)]
8pub enum NodeData {
9    Root,
10    Wrapper(usize),
11    Element(ElementData),
12    Text(TextData),
13    Whitespace(TextData),
14}
15
16#[derive(Debug, Clone)]
17pub struct ElementData {
18    pub offset: usize,
19    pub name: String,
20    pub qualified_name: Option<String>,
21    pub attributes: Attributes,
22    /// Set when an otherwise inline element contains block-level descendants
23    /// (invalid block-in-inline markup); it is then laid out as a block.
24    pub force_block: bool,
25}
26
27impl ElementData {
28    fn is_block(&self) -> bool {
29        self.force_block
30            || matches!(
31                self.name.as_str(),
32                "address"
33                    | "article"
34                    | "aside"
35                    | "blockquote"
36                    | "body"
37                    | "head"
38                    | "details"
39                    | "dialog"
40                    | "dd"
41                    | "div"
42                    | "dl"
43                    | "dt"
44                    | "fieldset"
45                    | "figcaption"
46                    | "figure"
47                    | "footer"
48                    | "form"
49                    | "h1"
50                    | "h2"
51                    | "h3"
52                    | "h4"
53                    | "h5"
54                    | "h6"
55                    | "header"
56                    | "hgroup"
57                    | "hr"
58                    | "html"
59                    | "li"
60                    | "main"
61                    | "nav"
62                    | "ol"
63                    | "p"
64                    | "pre"
65                    | "section"
66                    | "table"
67                    | "thead"
68                    | "colgroup"
69                    | "tbody"
70                    | "tfoot"
71                    | "tr"
72                    | "caption"
73                    | "td"
74                    | "th"
75                    | "ul"
76            )
77    }
78}
79
80impl NodeData {
81    fn text(&self) -> Option<&str> {
82        match *self {
83            NodeData::Text(TextData { ref text, .. })
84            | NodeData::Whitespace(TextData { ref text, .. }) => Some(text),
85            _ => None,
86        }
87    }
88
89    fn offset(&self) -> usize {
90        match *self {
91            NodeData::Text(TextData { offset, .. })
92            | NodeData::Whitespace(TextData { offset, .. })
93            | NodeData::Element(ElementData { offset, .. }) => offset,
94            NodeData::Wrapper(offset) => offset,
95            NodeData::Root => 0,
96        }
97    }
98}
99
100#[derive(Debug, Clone)]
101pub struct TextData {
102    pub offset: usize,
103    pub text: String,
104}
105
106pub fn element(name: &str, offset: usize, attributes: Attributes) -> NodeData {
107    let colon = name.find(':');
108    NodeData::Element(ElementData {
109        offset,
110        name: name[colon.map(|index| index + 1).unwrap_or(0)..].to_string(),
111        qualified_name: colon.map(|_| name.to_string()),
112        attributes,
113        force_block: false,
114    })
115}
116
117pub fn text(text: &str, offset: usize) -> NodeData {
118    NodeData::Text(TextData {
119        offset,
120        text: text.to_string(),
121    })
122}
123
124pub fn whitespace(text: &str, offset: usize) -> NodeData {
125    NodeData::Whitespace(TextData {
126        offset,
127        text: text.to_string(),
128    })
129}
130
131#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
132pub struct NodeId(NonZeroUsize);
133
134impl NodeId {
135    pub fn from_index(n: usize) -> Self {
136        NodeId(unsafe { NonZeroUsize::new_unchecked(n + 1) })
137    }
138
139    pub fn to_index(self) -> usize {
140        self.0.get() - 1
141    }
142}
143
144#[derive(Debug, Clone)]
145pub struct XmlTree {
146    nodes: Vec<Node>,
147}
148
149#[derive(Debug, Clone)]
150pub struct Node {
151    data: NodeData,
152    parent: Option<NodeId>,
153    previous_sibling: Option<NodeId>,
154    next_sibling: Option<NodeId>,
155    first_child: Option<NodeId>,
156    last_child: Option<NodeId>,
157}
158
159impl Default for Node {
160    fn default() -> Self {
161        Node {
162            data: NodeData::Root,
163            parent: None,
164            previous_sibling: None,
165            next_sibling: None,
166            first_child: None,
167            last_child: None,
168        }
169    }
170}
171
172#[derive(Debug, Copy, Clone)]
173pub struct NodeRef<'a> {
174    pub id: NodeId,
175    pub node: &'a Node,
176    pub tree: &'a XmlTree,
177}
178
179#[derive(Debug)]
180pub struct NodeMut<'a> {
181    pub id: NodeId,
182    pub tree: &'a mut XmlTree,
183}
184
185impl XmlTree {
186    pub fn new() -> Self {
187        XmlTree {
188            nodes: vec![Node::default()],
189        }
190    }
191
192    fn node(&self, id: NodeId) -> &Node {
193        unsafe { self.nodes.get_unchecked(id.to_index()) }
194    }
195
196    fn node_mut(&mut self, id: NodeId) -> &mut Node {
197        unsafe { self.nodes.get_unchecked_mut(id.to_index()) }
198    }
199
200    pub fn get(&self, id: NodeId) -> NodeRef<'_> {
201        NodeRef {
202            id,
203            node: self.node(id),
204            tree: self,
205        }
206    }
207
208    pub fn get_mut(&mut self, id: NodeId) -> NodeMut<'_> {
209        NodeMut { id, tree: self }
210    }
211
212    pub fn root(&self) -> NodeRef<'_> {
213        self.get(NodeId::from_index(0))
214    }
215
216    pub fn root_mut(&mut self) -> NodeMut<'_> {
217        self.get_mut(NodeId::from_index(0))
218    }
219
220    pub fn push_node(&mut self, data: NodeData) -> NodeId {
221        let id = NodeId::from_index(self.nodes.len());
222        let node = Node {
223            data,
224            parent: None,
225            previous_sibling: None,
226            next_sibling: None,
227            first_child: None,
228            last_child: None,
229        };
230        self.nodes.push(node);
231        id
232    }
233
234    pub fn attach_child(&mut self, parent_id: NodeId, child_id: NodeId) {
235        let last_child = self.node(parent_id).last_child;
236
237        let child = self.node_mut(child_id);
238        child.parent = Some(parent_id);
239        child.previous_sibling = last_child;
240        child.next_sibling = None;
241
242        if let Some(last) = last_child {
243            self.node_mut(last).next_sibling = Some(child_id);
244        }
245
246        self.node_mut(parent_id).last_child = Some(child_id);
247
248        if self.node(parent_id).first_child.is_none() {
249            self.node_mut(parent_id).first_child = Some(child_id);
250        }
251    }
252
253    pub fn insert_before(&mut self, sibling_id: NodeId, new_id: NodeId) {
254        let parent_id = self.node(sibling_id).parent;
255        let prev_sibling = self.node(sibling_id).previous_sibling;
256
257        self.node_mut(new_id).parent = parent_id;
258        self.node_mut(new_id).previous_sibling = prev_sibling;
259        self.node_mut(new_id).next_sibling = Some(sibling_id);
260
261        self.node_mut(sibling_id).previous_sibling = Some(new_id);
262
263        if let Some(prev) = prev_sibling {
264            self.node_mut(prev).next_sibling = Some(new_id);
265        } else if let Some(parent) = parent_id {
266            self.node_mut(parent).first_child = Some(new_id);
267        }
268    }
269
270    pub fn detach(&mut self, id: NodeId) {
271        let node = self.node(id);
272        let parent_id = node.parent;
273        let prev_sibling = node.previous_sibling;
274        let next_sibling = node.next_sibling;
275
276        if let Some(prev) = prev_sibling {
277            self.node_mut(prev).next_sibling = next_sibling;
278        } else if let Some(parent) = parent_id {
279            self.node_mut(parent).first_child = next_sibling;
280        }
281
282        if let Some(next) = next_sibling {
283            self.node_mut(next).previous_sibling = prev_sibling;
284        } else if let Some(parent) = parent_id {
285            self.node_mut(parent).last_child = prev_sibling;
286        }
287
288        self.node_mut(id).parent = None;
289        self.node_mut(id).previous_sibling = None;
290        self.node_mut(id).next_sibling = None;
291    }
292
293    pub fn append_text_to(&mut self, id: NodeId, extra: &str) {
294        match &mut self.node_mut(id).data {
295            NodeData::Text(TextData { text, .. }) | NodeData::Whitespace(TextData { text, .. }) => {
296                text.push_str(extra);
297            }
298            _ => {}
299        }
300    }
301
302    pub fn add_attr_if_missing(&mut self, id: NodeId, name: &str, value: &str) {
303        if let NodeData::Element(ElementData {
304            ref mut attributes, ..
305        }) = self.node_mut(id).data
306        {
307            if !attributes.contains_key(name) {
308                attributes.insert(name.to_string(), value.to_string());
309            }
310        }
311    }
312
313    /// Promote inline elements that contain block-level descendants to blocks.
314    ///
315    /// Such block-in-inline nesting is invalid HTML (e.g.
316    /// `<span><div>…</div></span>`, common in EPUB converter output) and would
317    /// otherwise be flattened into a single inline run by
318    /// `gather_inline_material`, silently dropping the block content. Must run
319    /// before `wrap_lost_inlines` so the promoted elements are not wrapped as
320    /// lost inlines.
321    fn promote_blockish_inlines(&mut self) {
322        let ids: Vec<NodeId> = self
323            .root()
324            .descendants()
325            .filter(|n| {
326                matches!(n.data(), NodeData::Element(..))
327                    && n.is_inline()
328                    && n.descendants().any(|d| d.is_block())
329            })
330            .map(|n| n.id)
331            .collect();
332
333        for id in ids {
334            if let NodeData::Element(e) = &mut self.node_mut(id).data {
335                e.force_block = true;
336            }
337        }
338    }
339
340    pub fn wrap_lost_inlines(&mut self) {
341        self.promote_blockish_inlines();
342
343        let mut ids = Vec::new();
344        let mut known_ids = FxHashSet::default();
345
346        for n in self.root().descendants().filter(|n| n.is_inline()) {
347            if known_ids.contains(&n.id) {
348                continue;
349            }
350
351            let mut first_id = None;
352            let mut last_id = None;
353
354            for s in n.previous_siblings() {
355                if s.is_block() {
356                    first_id = Some(s.next_sibling().unwrap().id);
357                    break;
358                } else {
359                    known_ids.insert(s.id);
360                }
361            }
362
363            for s in n.next_siblings() {
364                if s.is_block() {
365                    last_id = Some(s.previous_sibling().unwrap().id);
366                    break;
367                } else {
368                    known_ids.insert(s.id);
369                }
370            }
371
372            if first_id.is_some() || last_id.is_some() {
373                let parent = n.parent().unwrap();
374                ids.push([
375                    parent.id,
376                    first_id.unwrap_or_else(|| parent.node.first_child.unwrap()),
377                    last_id.unwrap_or_else(|| parent.node.last_child.unwrap()),
378                ]);
379            }
380        }
381
382        for [parent_id, first_id, last_id] in ids {
383            let offset = self.node(first_id).data.offset();
384            let mut node = self.get_mut(parent_id);
385            node.wrap_range(first_id, last_id, NodeData::Wrapper(offset));
386        }
387    }
388}
389
390impl<'a> NodeRef<'a> {
391    pub fn parent(&self) -> Option<Self> {
392        self.node.parent.map(|id| self.tree.get(id))
393    }
394
395    pub fn parent_element(&self) -> Option<Self> {
396        self.ancestors().find(|n| n.is_element() && !n.is_wrapper())
397    }
398
399    pub fn previous_sibling(&self) -> Option<Self> {
400        self.node.previous_sibling.map(|id| self.tree.get(id))
401    }
402
403    pub fn previous_sibling_element(&self) -> Option<NodeRef<'a>> {
404        self.previous_sibling_elements().next()
405    }
406
407    pub fn next_sibling_element(&self) -> Option<NodeRef<'a>> {
408        self.next_sibling_elements().next()
409    }
410
411    pub fn next_sibling(&self) -> Option<Self> {
412        self.node.next_sibling.map(|id| self.tree.get(id))
413    }
414
415    pub fn first_child(&self) -> Option<Self> {
416        self.node.first_child.map(|id| self.tree.get(id))
417    }
418
419    pub fn last_child(&self) -> Option<Self> {
420        self.node.last_child.map(|id| self.tree.get(id))
421    }
422
423    pub fn ancestors(&self) -> Ancestors<'a> {
424        Ancestors {
425            next: self.parent(),
426        }
427    }
428
429    pub fn ancestor_elements(&self) -> impl Iterator<Item = NodeRef<'a>> {
430        self.ancestors()
431            .filter(|n| n.is_element() && !n.is_wrapper())
432    }
433
434    pub fn previous_siblings(&self) -> PreviousSiblings<'a> {
435        PreviousSiblings {
436            next: self.previous_sibling(),
437        }
438    }
439
440    pub fn next_siblings(&self) -> NextSiblings<'a> {
441        NextSiblings {
442            next: self.next_sibling(),
443        }
444    }
445
446    pub fn previous_sibling_elements(&self) -> impl Iterator<Item = NodeRef<'a>> {
447        self.previous_siblings().filter(|n| n.is_element())
448    }
449
450    pub fn next_sibling_elements(&self) -> impl Iterator<Item = NodeRef<'a>> {
451        self.next_siblings().filter(|n| n.is_element())
452    }
453
454    pub fn children(&self) -> Children<'a> {
455        Children {
456            next: self.first_child(),
457        }
458    }
459
460    pub fn descendants(&self) -> Descendants<'a> {
461        Descendants {
462            root_id: self.id,
463            next: self.first_child(),
464        }
465    }
466
467    pub fn has_children(&self) -> bool {
468        self.node.first_child.is_some()
469    }
470
471    pub fn is_element(&self) -> bool {
472        matches!(
473            self.node.data,
474            NodeData::Element { .. } | NodeData::Wrapper(..) | NodeData::Root
475        )
476    }
477
478    pub fn is_inline(&self) -> bool {
479        match &self.node.data {
480            NodeData::Element(e) => !e.is_block(),
481            NodeData::Text(..) => true,
482            _ => false,
483        }
484    }
485
486    pub fn is_block(&self) -> bool {
487        match &self.node.data {
488            NodeData::Element(e) => e.is_block(),
489            NodeData::Wrapper(..) | NodeData::Root => true,
490            _ => false,
491        }
492    }
493
494    pub fn is_wrapper(&self) -> bool {
495        matches!(self.node.data, NodeData::Wrapper(..))
496    }
497
498    pub fn data(&self) -> &'a NodeData {
499        &self.node.data
500    }
501
502    pub fn offset(&self) -> usize {
503        self.node.data.offset()
504    }
505
506    pub fn text(&self) -> String {
507        self.node.data.text().map(String::from).unwrap_or_else(|| {
508            self.descendants()
509                .filter_map(|n| n.node.data.text())
510                .fold(String::new(), |mut a, b| {
511                    a.push_str(b);
512                    a
513                })
514        })
515    }
516
517    pub fn tag_name(&self) -> Option<&'a str> {
518        match self.node.data {
519            NodeData::Element(ElementData { ref name, .. }) => Some(name),
520            NodeData::Wrapper(..) => Some(WRAPPER_TAG_NAME),
521            _ => None,
522        }
523    }
524
525    pub fn tag_qualified_name(&self) -> Option<&'a str> {
526        match self.node.data {
527            NodeData::Element(ElementData {
528                ref qualified_name, ..
529            }) => qualified_name.as_deref(),
530            _ => None,
531        }
532    }
533
534    pub fn attributes(&self) -> Option<&'a Attributes> {
535        match self.node.data {
536            NodeData::Element(ElementData { ref attributes, .. }) => Some(attributes),
537            _ => None,
538        }
539    }
540
541    pub fn attribute(&self, name: &str) -> Option<&'a str> {
542        self.attributes()
543            .and_then(|a| a.get(name).map(String::as_str))
544    }
545
546    pub fn classes(&self) -> impl Iterator<Item = &'a str> {
547        self.attribute("class").unwrap_or("").split_whitespace()
548    }
549
550    pub fn id(&self) -> Option<&str> {
551        self.attribute("id")
552    }
553
554    pub fn find(&self, tag_name: &str) -> Option<Self> {
555        self.descendants().find(|n| n.tag_name() == Some(tag_name))
556    }
557
558    pub fn find_by_id(&self, id: &str) -> Option<Self> {
559        self.descendants().find(|n| n.id() == Some(id))
560    }
561}
562
563impl<'a> NodeMut<'a> {
564    fn node(&mut self) -> &mut Node {
565        self.tree.node_mut(self.id)
566    }
567
568    pub fn append(&mut self, data: NodeData) -> NodeId {
569        let id = NodeId::from_index(self.tree.nodes.len());
570
571        let node = Node {
572            data,
573            parent: Some(self.id),
574            previous_sibling: self.node().last_child,
575            next_sibling: None,
576            first_child: None,
577            last_child: None,
578        };
579
580        self.tree.nodes.push(node);
581
582        if let Some(last_child) = self.node().last_child {
583            self.tree.node_mut(last_child).next_sibling = Some(id);
584        }
585
586        self.node().last_child = Some(id);
587
588        if self.node().first_child.is_none() {
589            self.node().first_child = Some(id);
590        }
591
592        id
593    }
594
595    pub fn wrap_range(&mut self, first_id: NodeId, last_id: NodeId, data: NodeData) {
596        let before = self.tree.node(first_id).previous_sibling;
597        let after = self.tree.node(last_id).next_sibling;
598        let id = NodeId::from_index(self.tree.nodes.len());
599
600        let node = Node {
601            data,
602            parent: Some(self.id),
603            previous_sibling: before,
604            next_sibling: after,
605            first_child: Some(first_id),
606            last_child: Some(last_id),
607        };
608
609        self.tree.nodes.push(node);
610
611        if let Some(before_id) = before {
612            self.tree.node_mut(before_id).next_sibling = Some(id);
613        }
614
615        if let Some(after_id) = after {
616            self.tree.node_mut(after_id).previous_sibling = Some(id);
617        }
618
619        if let Some(first_child_id) = self.node().first_child {
620            if first_child_id == first_id {
621                self.node().first_child = Some(id);
622            }
623        }
624
625        if let Some(last_child_id) = self.node().last_child {
626            if last_child_id == last_id {
627                self.node().last_child = Some(id);
628            }
629        }
630
631        self.tree.node_mut(first_id).previous_sibling = None;
632        self.tree.node_mut(last_id).next_sibling = None;
633        self.tree.node_mut(first_id).parent = Some(id);
634
635        let mut node_id = first_id;
636        while let Some(next_id) = self.tree.node(node_id).next_sibling {
637            self.tree.node_mut(next_id).parent = Some(id);
638            node_id = next_id;
639        }
640    }
641}
642
643pub struct Ancestors<'a> {
644    next: Option<NodeRef<'a>>,
645}
646
647pub struct NextSiblings<'a> {
648    next: Option<NodeRef<'a>>,
649}
650
651pub struct PreviousSiblings<'a> {
652    next: Option<NodeRef<'a>>,
653}
654
655pub struct Children<'a> {
656    next: Option<NodeRef<'a>>,
657}
658
659pub struct Descendants<'a> {
660    root_id: NodeId,
661    next: Option<NodeRef<'a>>,
662}
663
664impl<'a> Iterator for Ancestors<'a> {
665    type Item = NodeRef<'a>;
666
667    fn next(&mut self) -> Option<Self::Item> {
668        let node = self.next.take();
669        self.next = node.as_ref().and_then(|node| node.parent());
670        node
671    }
672}
673
674impl<'a> Iterator for PreviousSiblings<'a> {
675    type Item = NodeRef<'a>;
676
677    fn next(&mut self) -> Option<Self::Item> {
678        let node = self.next.take();
679        self.next = node.as_ref().and_then(|node| node.previous_sibling());
680        node
681    }
682}
683
684impl<'a> Iterator for NextSiblings<'a> {
685    type Item = NodeRef<'a>;
686
687    fn next(&mut self) -> Option<Self::Item> {
688        let node = self.next.take();
689        self.next = node.as_ref().and_then(|node| node.next_sibling());
690        node
691    }
692}
693
694impl<'a> Iterator for Children<'a> {
695    type Item = NodeRef<'a>;
696
697    fn next(&mut self) -> Option<Self::Item> {
698        let node = self.next.take();
699        self.next = node.as_ref().and_then(|node| node.next_sibling());
700        node
701    }
702}
703
704impl<'a> Iterator for Descendants<'a> {
705    type Item = NodeRef<'a>;
706
707    fn next(&mut self) -> Option<Self::Item> {
708        let node = self.next.take();
709        if let Some(node) = node {
710            self.next = node
711                .first_child()
712                .or_else(|| node.next_sibling())
713                .or_else(|| {
714                    node.ancestors()
715                        .take_while(|n| n.id != self.root_id)
716                        .find(|n| n.node.next_sibling.is_some())
717                        .and_then(|n| n.next_sibling())
718                });
719        }
720        node
721    }
722}