Source code for lookatme.parser

"""
This module defines the parser for the markdown presentation file
"""


import re
from collections import defaultdict
import copy
from typing import cast, AnyStr, Callable, Dict, List, Tuple

import markdown_it
import markdown_it.token

from lookatme.schemas import MetaSchema
from lookatme.slide import Slide
from lookatme.tutorial import tutor
import lookatme.utils as utils


def _set_map_for_inline(token: Dict):
    start = token["map"][0]
    for child in token["children"]:
        child["map"] = [start, start + 1]
        if child["type"] == "softbreak":
            start += 1


[docs]def md_to_tokens(md_text):
    md = markdown_it.MarkdownIt("gfm-like").disable("html_block")
    tokens = md.parse(md_text)
    res = []
    for token_tmp in tokens:
        token = cast(Dict, token_tmp.as_dict())
        if token["type"] in ("heading_open", "heading_close"):
            token["level"] = int(token["tag"].replace("h", ""))
        if token["type"] == "inline":
            _set_map_for_inline(token)
        res.append(token)

    return res


[docs]def is_heading(token):
    return token["type"] == "heading_open"


[docs]def is_hrule(token):
    return token["type"] == "hr"


[docs]class SlideIsolator:
    def __init__(self):
        self.slide_tokens = []
        self.slides = []
        self.number = 0

[docs]    def create_slides(self, tokens, number) -> List[Slide]:
        self.slide_tokens = []
        self.slides: List[Slide] = []
        self.number = number

        self._isolate_progressive_slides(tokens, self.slide_tokens)

        if not self.slides or (
            self.slides and self.slides[-1].tokens != self.slide_tokens
        ):
            self._isolate_slide()

        return self.slides

    def _is_progressive_slide_delimiter(self, token: Dict) -> bool:
        return (
            token["type"] == "html_inline"
            and re.match(r"<!--\s*stop\s*-->", token["content"]) is not None
        )

    def _isolate_progressive_slides(
        self,
        input_tokens: List[Dict],
        output_tokens: List[Dict],
    ):
        """Recursively iterate through the provided input tokens, calling the
        isolate_slide callback whenever a progressive slide token is found. Also
        adding all iterated tokens back into the output_tokens list.
        """
        for token in input_tokens:
            if self._is_progressive_slide_delimiter(token):
                self._isolate_slide()
                continue

            children = token.get("children", None)
            if children is not None:
                # shallow copy here is fine - we only care about zeroing out the
                # children and adding the children back in one-by-one
                output_token = copy.copy(token)
                output_token["children"] = []
                output_tokens.append(output_token)
                self._isolate_progressive_slides(
                    children,
                    output_token["children"],
                )
            else:
                output_tokens.append(token)

    def _isolate_slide(self):
        self.slides.append(Slide(copy.deepcopy(self.slide_tokens), self.number))
        self.number += 1


[docs]class Parser(object):
    """A parser for markdown presentation files"""

    def __init__(self, single_slide=False):
        """Create a new Parser instance"""
        self._single_slide = single_slide

[docs]    def parse(self, input_data) -> Tuple[Dict, List[Slide], str]:
        """Parse the provided input data into a Presentation object

        :param str input_data: The input markdown presentation to parse
        """
        no_meta_input_data, meta = self.parse_meta(input_data)
        slides = self.parse_slides(meta, no_meta_input_data)
        return meta, slides, no_meta_input_data

[docs]    def parse_slides(self, meta, input_data) -> List[Slide]:
        """Parse the Slide out of the input data

        :param dict meta: The parsed meta values
        :param str input_data: The input data string
        :returns: List[Slide]
        """
        tokens = md_to_tokens(input_data)
        utils.debug_print_tokens(tokens)
        num_hrules, hinfo = self._scan_for_smart_split(tokens)
        keep_split_token = True

        if self._single_slide:
            return [Slide(tokens, 0)]

        if num_hrules == 0:
            if meta.get("title", "") in ["", None]:
                meta["title"] = hinfo["title"]

            def slide_split_check(token):  # type: ignore
                nonlocal hinfo
                return is_heading(token) and token["level"] == hinfo["lowest_non_title"]

            def heading_mod(token):  # type: ignore
                nonlocal hinfo
                token["level"] = max(
                    token["level"] - (hinfo["title_level"] or 0),
                    1,
                )

            keep_split_token = True
        else:

            def slide_split_check(token):  # type: ignore
                return is_hrule(token)

            def heading_mod(_):  # type: ignore
                pass

            keep_split_token = False

        slides = self._split_tokens_into_slides(
            tokens, slide_split_check, heading_mod, keep_split_token
        )

        return slides

    def _split_tokens_into_slides(
        self,
        tokens: List[Dict],
        slide_split_check: Callable,
        heading_mod: Callable,
        keep_split_token: bool,
    ) -> List[Slide]:
        """Split the provided tokens into slides using the slide_split_check
        and heading_mod arguments.
        """
        slides = []
        curr_slide_tokens = []
        for token in tokens:
            should_split = slide_split_check(token)
            if is_heading(token):
                heading_mod(token)

            # new slide!
            if should_split:
                if (
                    keep_split_token
                    and len(slides) == 0
                    and len(curr_slide_tokens) == 0
                ):
                    pass
                else:
                    slides.extend(self._create_slides(curr_slide_tokens, len(slides)))
                curr_slide_tokens = []
                if keep_split_token:
                    curr_slide_tokens.append(token)
                continue
            else:
                curr_slide_tokens.append(token)

        slides.extend(self._create_slides(curr_slide_tokens, len(slides)))

        return slides

    def _get_heading_contents(self, tokens, start_idx):
        num_heading_opens = 0
        res = []
        for token in tokens[start_idx:]:
            if token["type"] == "heading_open":
                num_heading_opens += 1
            elif token["type"] == "heading_close":
                num_heading_opens -= 1
                if num_heading_opens == 0:
                    break
            else:
                res.append(token)
        return res

    @tutor(
        "general",
        "slides splitting",
        r"""
        Slides can be:

        ## Separated by horizontal rules (three or more `*`, `-`, or `_`)

        ```markdown
        slide 1
        ***
        slide 2
        ```

        ## Split using existing headings ("smart" splitting)

        ```markdown
        # Slide 1

        # Slide 2
        ```

        ## Rendered as a single slide with the `--single` or `--one` CLI parameter

        ```bash
        lookatme --single content.md
        ```
        """,
        order=2,
    )
    def _scan_for_smart_split(self, tokens):
        """Scan the provided tokens for the number of hrules, and the lowest
        (h1 < h2) header level.

        :returns: tuple (num_hrules, lowest_header_level)
        """
        hinfo = {
            "title_level": None,
            "lowest_non_title": 10,
            "counts": defaultdict(int),
            "title": "",
        }
        num_hrules = 0
        first_heading = None
        first_heading_contents = None
        for idx, token in enumerate(tokens):
            if is_hrule(token):
                num_hrules += 1
            elif is_heading(token):
                hinfo["counts"][token["level"]] += 1
                if first_heading is None:
                    first_heading = token
                    first_heading_contents = self._get_heading_contents(tokens, idx)

        # started off with the lowest heading, make this title
        if (
            hinfo["counts"]
            and first_heading
            and isinstance(first_heading_contents, list)
            and hinfo["counts"][first_heading["level"]] == 1
        ):
            map_start = first_heading_contents[0]["map"]
            map_end = first_heading_contents[-1]["map"]
            hinfo["title"] = (
                [{"type": "paragraph_open", "map": [map_start[0], map_start[0] + 1]}]
                + first_heading_contents
                + [{"type": "paragraph_close", "map": [map_end[-1] - 1, map_end[-1]]}]
            )
            del hinfo["counts"][first_heading["level"]]
            hinfo["title_level"] = first_heading["level"]

        low_level = min(list(hinfo["counts"].keys()) + [10])
        hinfo["title_level"] = low_level - 1
        hinfo["lowest_non_title"] = low_level

        return num_hrules, hinfo

[docs]    @tutor(
        "general",
        "metadata",
        r"""
        The YAML metadata that can be prefixed in slides includes these top level
        fields:

        ```yaml
        ---
        title: "title"
        date: "date"
        author: "author"
        extensions:
          - extension 1
          # .. list of extensions
        styles:
          # .. nested style fields ..
        ---
        ```

        > **NOTE** The `styles` field will be explained in detail with each markdown
        > element.
        """,
        order=3,
    )
    def parse_meta(self, input_data) -> Tuple[AnyStr, Dict]:
        """Parse the PresentationMeta out of the input data

        :param str input_data: The input data string
        :returns: tuple of (remaining_data, meta)
        """
        found_first = False
        yaml_data = []
        skipped_chars = 0
        for line in input_data.split("\n"):
            skipped_chars += len(line) + 1
            stripped_line = line.strip()

            is_marker = re.match(r"----*", stripped_line) is not None
            if is_marker:
                if not found_first:
                    found_first = True
                # found the second one
                else:
                    break

            if found_first and not is_marker:
                yaml_data.append(line)
                continue

            # there was no ----* marker
            if not found_first and stripped_line != "":
                break

        if not found_first:
            return input_data, MetaSchema().load_partial_styles({}, partial=True)

        new_input = input_data[skipped_chars:]
        if len(yaml_data) == 0:
            return new_input, MetaSchema().load_partial_styles({}, partial=True)

        yaml_data = "\n".join(yaml_data)
        data = MetaSchema().loads_partial_styles(yaml_data, partial=True)
        return new_input, data

    @tutor(
        "general",
        "progressive slides",
        r"""
        Slides can be progressively displayed by inserting `<!-- stop -->`
        comments anywhere in the markdown.

        <TUTOR:EXAMPLE>
        This will display first, and after you press advance ...<!-- stop -->

        * this <!-- stop -->
          * displays <!-- stop -->

        | and <!-- stop --> | then <!-- stop -->     |
        |-------------------|------------------------|
        | this              | and this               |

        <!-- stop -->

        and finally this!
        </TUTOR:EXAMPLE>
        """,
        order=2,
    )
    def _create_slides(self, tokens, number):
        """Create additional slides from the provided token stream, splitting
        wherever progressive slide markers are found.
        """
        slide_isolator = SlideIsolator()
        return slide_isolator.create_slides(tokens, number)