Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/perl
- #
- # Markdown2BBCode.pl -- A text-to-BBCode conversion tool for ProBoards users
- #
- # Derived from Markdown.pl -- A text-to-HTML conversion tool
- # Copyright (c) 2004 John Gruber
- # <http://daringfireball.net/projects/markdown/>
- #
- # Additional customizations for BBCode Written by Krusader74
- # http://odd74.proboards.com/user/2141
- # http://pastebin.com/u/krusader74
- # 26 Jan 2016
- # @ Copyleft All Wrongs Reserved
- package Markdown;
- use strict;
- use warnings;
- use Digest::MD5 qw(md5_hex);
- use vars qw($VERSION);
- $VERSION = '3.0';
- use Switch;
- use YAML::Tiny;
- use Games::Dice 'roll';
- use utf8;
- use feature 'unicode_strings';
- use open qw(:std :utf8);
- binmode(STDIN, ":utf8");
- binmode(STDOUT, ":utf8");
- binmode(STDERR, ":utf8");
- #
- # Global default settings:
- #
- my $g_empty_element_suffix = "⟧";
- my $g_tab_width = 4;
- # Because Markdown and BBCode both use square brackets [, ] heavily
- # we are going to use a trick and encode all BBCode using double square brackets ⟦, ⟧
- # until we get to the very end when we'll turn them back into regular square brackets.
- #
- # Globals:
- #
- # Regexen to match balanced [brackets], (parens), {braces} and <chevrons>. See Friedl's
- # "Mastering Regular Expressions", 2nd Ed., pp. 328-331.
- my $g_nested_brackets;
- $g_nested_brackets = qr{
- (?> # Atomic matching
- [^\[\]]+ # Anything other than brackets
- |
- \[
- (??{ $g_nested_brackets }) # Recursive set of nested brackets
- \]
- )*
- }x;
- my $g_nested_parens;
- $g_nested_parens = qr{
- (?> # Atomic matching
- [^\(\)]+ # Anything other than parens
- |
- \(
- (??{ $g_nested_parens }) # Recursive set of nested parens
- \)
- )*
- }x;
- my $g_nested_braces;
- $g_nested_braces = qr{
- (?> # Atomic matching
- [^\{\}]+ # Anything other than braces
- |
- \{
- (??{ $g_nested_braces }) # Recursive set of nested braces
- \}
- )*
- }x;
- my $g_nested_chevrons;
- $g_nested_chevrons = qr{
- (?> # Atomic matching
- [^\<\>]+ # Anything other than chevrons
- |
- \<
- (??{ $g_nested_chevrons }) # Recursive set of nested chevrons
- \>
- )*
- }x;
- # Table of hash values for escaped characters:
- my %g_escape_table;
- foreach my $char (split //, '\\`*_{}[]()>#+-.!~') {
- $g_escape_table{$char} = md5_hex($char);
- }
- # Global hashes, used by various utility routines
- my %g_urls;
- my %g_titles;
- my %g_html_blocks;
- my %g_footnotes;
- my $yaml;
- my @matharr;
- my $mathctr;
- my %mathsym;
- my %mathcal;
- my %mathbb;
- my %dice;
- # Used to track when we're inside an ordered or unordered list
- # (see _ProcessListItems() for details):
- my $g_list_level = 0;
- #### Check for command-line switches: #################
- my %cli_opts;
- use Getopt::Long;
- Getopt::Long::Configure('pass_through');
- GetOptions(\%cli_opts,
- 'version',
- 'shortversion',
- );
- if ($cli_opts{'version'}) { # Version info
- print "\nThis is Markdown2BBcode, version $VERSION.\n";
- print "Original Markdown script Copyright 2004 John Gruber\n";
- print "http://daringfireball.net/projects/markdown/\n\n";
- print "New support for BBCode @ CopyLeft 2016 Krusader74\n";
- print "http://pastebin.com/u/krusader74\n\n";
- exit 0;
- }
- if ($cli_opts{'shortversion'}) { # Just the version number string.
- print $VERSION;
- exit 0;
- }
- #### Process incoming text: ###########################
- my $text;
- {
- local $/; # Slurp the whole file
- $text = <>;
- }
- print Markdown($text);
- sub Markdown {
- #
- # Main function. The order in which other subs are called here is
- # essential. Link and image substitutions need to happen before
- # _EscapeSpecialChars(), so that any *'s or _'s in the <a>
- # and <img> tags get encoded.
- #
- my $text = shift;
- # Clear the global hashes. If we don't clear these, you get conflicts
- # from other articles when generating a page which contains more than
- # one article (e.g. an index page that shows the N most recent
- # articles):
- %g_urls = ();
- %g_titles = ();
- %g_html_blocks = ();
- %g_footnotes = ();
- _InitMath();
- $text = _PreProcess($text);
- # Standardize line endings:
- $text =~ s{\r\n}{\n}g; # DOS to Unix
- $text =~ s{\r}{\n}g; # Mac to Unix
- # Make sure $text ends with a couple of newlines:
- $text .= "\n\n";
- # Convert all tabs to spaces.
- $text = _Detab($text);
- # Strip any lines consisting only of spaces and tabs.
- # This makes subsequent regexen easier to write, because we can
- # match consecutive blank lines with /\n+/ instead of something
- # contorted like /[ \t]*\n+/ .
- $text =~ s/^[ \t]+$//mg;
- # Parse the YAML header first, if there is one. Then process metadata variables
- $text = _InitYAML($text);
- # Turn LaTeX math spans into array entries
- $text = _HashMathSpans($text);
- # Turn block-level HTML blocks into hash entries
- $text = _HashBBCodeBlocks($text);
- # Strip link definitions, store in hashes.
- $text = _StripLinkDefinitions($text);
- $text = _StripFootnotes($text);
- $text = _RunBlockGamut($text);
- $text = _UnescapeSpecialChars($text);
- $text = _PostProcess($text);
- return $text . "\n";
- }
- sub _StripFootnotes {
- #
- # Strips footnotes from text, stores the URLs and titles in
- # hash references.
- #
- my $text = shift;
- my $less_than_tab = $g_tab_width - 1;
- my $footnotes;
- # Footnotes are in the form: [^id]: text of note
- while ($text =~ s{
- ( # whole footnote = $1
- ^
- [ ]{0,$less_than_tab}
- (?:
- \[ \^ ( .+? ) \] : # id = $2
- )
- [ \t]*
- ( # note = $3
- (?: .+? )
- (?: \n\n? [ ]{4} .+? )*
- )
- (
- \z
- |
- \n{2,}
- )
- )
- }{}mx) {
- $g_footnotes{lc $2} = $3;
- }
- my $count = keys %g_footnotes;
- if($count > 0){
- $footnotes = "⟦hr⟧\n";
- foreach my $key (sort(keys(%g_footnotes))) {
- $footnotes .= "⟦" . $key . "⟧: " .
- _RunBlockGamut(_Outdent($g_footnotes{$key} . "\n\n" )) . "\n\n";
- }
- }
- return $text . $footnotes;
- }
- sub _StripLinkDefinitions {
- #
- # Strips link definitions from text, stores the URLs and titles in
- # hash references.
- #
- my $text = shift;
- my $less_than_tab = $g_tab_width - 1;
- # Link defs are in the form: ^[id]: url "optional title"
- while ($text =~ s{
- ^[ ]{0,$less_than_tab}\[(.+)\]: # id = $1
- [ \t]*
- \n? # maybe *one* newline
- [ \t]*
- <?(\S+?)>? # url = $2
- [ \t]*
- \n? # maybe one newline
- [ \t]*
- (?:
- (?<=\s) # lookbehind for whitespace
- ["(]
- (.+?) # title = $3
- [")]
- [ \t]*
- )? # title is optional
- (?:\n+|\Z)
- }
- {}mx) {
- $g_urls{lc $1} = _EncodeAmpsAndAngles( $2 ); # Link IDs are case-insensitive
- if ($3) {
- $g_titles{lc $1} = $3;
- $g_titles{lc $1} =~ s/"/"/g;
- }
- }
- return $text;
- }
- sub _HashBBCodeBlocks {
- my $text = shift;
- my $less_than_tab = $g_tab_width - 1;
- # Hashify HTML blocks:
- # We only want to do this for block-level HTML tags, such as headers,
- # lists, and tables. That's because we still want to wrap <p>s around
- # "paragraphs" that are wrapped in non-block-level tags, such as anchors,
- # phrase emphasis, and spans. The list of tags we're looking for is
- # hard-coded:
- my $block_tags_a = qr/p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|script|noscript|form|fieldset|iframe|math|ins|del/;
- my $block_tags_b = qr/p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|script|noscript|form|fieldset|iframe|math/;
- # First, look for nested blocks, e.g.:
- # <div>
- # <div>
- # tags for inner block must be indented.
- # </div>
- # </div>
- #
- # The outermost tags must start at the left margin for this to match, and
- # the inner nested divs must be indented.
- # We need to do this before the next, more liberal match, because the next
- # match will start at the first `<div>` and stop at the first `</div>`.
- $text =~ s{
- ( # save in $1
- ^ # start of line (with /m)
- \[($block_tags_a) # start tag = $2
- \b # word break
- (.*\n)*? # any number of lines, minimally matching
- \[/\2\] # the matching end tag
- [ \t]* # trailing spaces/tabs
- (?=\n+|\Z) # followed by a newline or end of document
- )
- }{
- my $key = md5_hex($1);
- $g_html_blocks{$key} = $1;
- "\n\n" . $key . "\n\n";
- }egmx;
- #
- # Now match more liberally, simply from `\n<tag>` to `</tag>\n`
- #
- $text =~ s{
- ( # save in $1
- ^ # start of line (with /m)
- \[($block_tags_b) # start tag = $2
- \b # word break
- (.*\n)*? # any number of lines, minimally matching
- .*\[/\2\] # the matching end tag
- [ \t]* # trailing spaces/tabs
- (?=\n+|\Z) # followed by a newline or end of document
- )
- }{
- my $key = md5_hex($1);
- $g_html_blocks{$key} = $1;
- "\n\n" . $key . "\n\n";
- }egmx;
- # Special case just for <hr />. It was easier to make a special case than
- # to make the other regex more complicated.
- $text =~ s{
- (?:
- (?<=\n\n) # Starting after a blank line
- | # or
- \A\n? # the beginning of the doc
- )
- ( # save in $1
- [ ]{0,$less_than_tab}
- \[(hr) # start tag = $2
- \b # word break
- ([^\[\]])*? #
- /?\] # the matching end tag
- [ \t]*
- (?=\n{2,}|\Z) # followed by a blank line or end of document
- )
- }{
- my $key = md5_hex($1);
- $g_html_blocks{$key} = $1;
- "\n\n" . $key . "\n\n";
- }egx;
- # Special case for standalone HTML comments:
- $text =~ s{
- (?:
- (?<=\n\n) # Starting after a blank line
- | # or
- \A\n? # the beginning of the doc
- )
- ( # save in $1
- [ ]{0,$less_than_tab}
- (?s:
- <!
- (--.*?--\s*)+
- >
- )
- [ \t]*
- (?=\n{2,}|\Z) # followed by a blank line or end of document
- )
- }{}gx;
- return $text;
- }
- sub _RunBlockGamut {
- #
- # These are all the transformations that form block-level
- # tags like paragraphs, headers, and list items.
- #
- my $text = shift;
- $text = _DoMetadataVariables($text);
- $text = _DoTables($text);
- $text = _DoHeaders($text);
- $text = _DoFootnotes($text);
- $text = _DoLists($text);
- $text = _DoCodeBlocks($text);
- $text = _DoBlockQuotes($text);
- $text = _DoHorizontalRules($text);
- # We already ran _HashBBCodeBlocks() before, in Markdown(), but that
- # was to escape raw HTML in the original Markdown source. This time,
- # we're escaping the markup we've just created, so that we don't wrap
- # <p> tags around block-level tags.
- $text = _HashBBCodeBlocks($text);
- $text = _FormParagraphs($text);
- return $text;
- }
- sub _RunSpanGamut {
- #
- # These are all the transformations that occur *within* block-level
- # tags like paragraphs, headers, and list items.
- #
- my $text = shift;
- $text = _DoCodeSpans($text);
- $text = _EscapeSpecialChars($text);
- # Process anchor and image tags. Images must come first,
- # because ![foo][f] looks like an anchor.
- $text = _DoAtCommands($text);
- $text = _DoImages($text);
- $text = _DoAnchors($text);
- # Make links out of things like `<http://example.com/>`
- # Must come after _DoAnchors(), because you can use < and >
- # delimiters in inline links like [this](<url>).
- $text = _DoAutoLinks($text);
- $text = _EncodeAmpsAndAngles($text);
- $text = _DoItalicsAndBold($text);
- # WAS: Do hard breaks:
- # $text =~ s/ {2,}\n/ <br$g_empty_element_suffix\n/g;
- $text =~ s/ {2,}\n/ \n\n/g;
- return $text;
- }
- sub _EscapeSpecialChars {
- my $text = shift;
- my $tokens ||= _TokenizeHTML($text);
- $text = ''; # rebuild $text from the tokens
- # my $in_pre = 0; # Keep track of when we're inside <pre> or <code> tags.
- # my $tags_to_skip = qr!<(/?)(?:pre|code|kbd|script|math)[\s>]!;
- foreach my $cur_token (@$tokens) {
- if ($cur_token->[0] eq "tag") {
- # Within tags, encode * and _ so they don't conflict
- # with their use in Markdown for italics and strong.
- # We're replacing each such character with its
- # corresponding MD5 checksum value; this is likely
- # overkill, but it should prevent us from colliding
- # with the escape values by accident.
- $cur_token->[1] =~ s! \* !$g_escape_table{'*'}!gx;
- $cur_token->[1] =~ s! _ !$g_escape_table{'_'}!gx;
- $cur_token->[1] =~ s! ~ !$g_escape_table{'~'}!gx;
- $text .= $cur_token->[1];
- } else {
- my $t = $cur_token->[1];
- $t = _EncodeBackslashEscapes($t);
- $text .= $t;
- }
- }
- return $text;
- }
- sub _DoHorizontalRules {
- my $text = shift;
- $text =~ s{^[ ]{0,2}([ ]?\*[ ]?){3,}[ \t]*$}{\n⟦hr⟧\n}gmx;
- $text =~ s{^[ ]{0,2}([ ]? -[ ]?){3,}[ \t]*$}{\n⟦hr⟧\n}gmx;
- $text =~ s{^[ ]{0,2}([ ]? _[ ]?){3,}[ \t]*$}{\n⟦hr⟧\n}gmx;
- return $text;
- }
- sub _DoAnchors {
- #
- # Turn Markdown link shortcuts into [a] tags.
- #
- my $text = shift;
- #
- # First, handle reference-style links: [link text] [id]
- #
- $text =~ s{
- ( # wrap whole match in $1
- \[
- ($g_nested_brackets) # link text = $2
- \]
- [ ]? # one optional space
- (?:\n[ ]*)? # one optional newline followed by spaces
- \[
- (.*?) # id = $3
- \]
- )
- }{
- my $result;
- my $whole_match = $1;
- my $link_text = $2;
- my $link_id = lc $3;
- if ($link_id eq "") {
- $link_id = lc $link_text; # for shortcut links like [this][].
- }
- if (defined $g_urls{$link_id}) {
- my $url = $g_urls{$link_id};
- $url =~ s! \* !$g_escape_table{'*'}!gx; # We've got to encode these to avoid
- $url =~ s! _ !$g_escape_table{'_'}!gx; # conflicting with italics/bold.
- $url =~ s! ~ !$g_escape_table{'~'}!gx; # conflicting with italics/bold.
- $result = "⟦a href=\"$url\"";
- if ( defined $g_titles{$link_id} ) {
- my $title = $g_titles{$link_id};
- $title =~ s! \* !$g_escape_table{'*'}!gx;
- $title =~ s! _ !$g_escape_table{'_'}!gx;
- $title =~ s! ~ !$g_escape_table{'~'}!gx;
- $result .= " title=\"$title\"";
- }
- $result .= "⟧$link_text⟦/a⟧";
- }
- else {
- $result = $whole_match;
- }
- $result;
- }xsge;
- #
- # Next, inline-style links: [link text](url "optional title")
- #
- $text =~ s{
- ( # wrap whole match in $1
- \[
- ($g_nested_brackets) # link text = $2
- \]
- \( # literal paren
- [ \t]*
- <?(.*?)>? # href = $3
- [ \t]*
- ( # $4
- (['"]) # quote char = $5
- (.*?) # Title = $6
- \5 # matching quote
- )? # title is optional
- \)
- )
- }{
- my $result;
- my $whole_match = $1;
- my $link_text = $2;
- my $url = $3;
- my $title = $6;
- $url =~ s! \* !$g_escape_table{'*'}!gx; # We've got to encode these to avoid
- $url =~ s! _ !$g_escape_table{'_'}!gx; # conflicting with italics/bold.
- $url =~ s! ~ !$g_escape_table{'~'}!gx; # conflicting with italics/bold.
- $result = "⟦a href=\"$url\"";
- if (defined $title) {
- $title =~ s/"/"/g;
- $title =~ s! \* !$g_escape_table{'*'}!gx;
- $title =~ s! _ !$g_escape_table{'_'}!gx;
- $title =~ s! ~ !$g_escape_table{'~'}!gx;
- $result .= " title=\"$title\"";
- }
- $result .= "⟧$link_text⟦/a⟧";
- $result;
- }xsge;
- return $text;
- }
- sub _DoFootnotes {
- my $text = shift;
- # [^id]
- $text =~ s{
- ( # wrap whole match in $1
- \[ \^
- (.*?) # id = $2
- \]
- )
- }{
- my $key = lc $2;
- "⟦sup⟧⟦" . $key . "⟧⟦/sup⟧";
- }xsge;
- return $text;
- }
- sub _DoAtCommands {
- #
- # Turn @-command shortcuts into BBCode tags.
- #
- my $text = shift;
- # @[command](argument)
- $text =~ s{
- ( # wrap whole match in $1
- @ \[
- (.*?) # command = $2
- \]
- \( # literal paren
- ($g_nested_parens) # argument = $3
- \)
- )
- }{
- my $command = $2;
- my $argument = $3;
- # Check for nested at commands
- if($command !~ /nomarkdown/i && $argument =~ /@\[/) { $argument = _DoAtCommands($argument) }
- _ParseAtCommand($command, $argument);
- }xsge;
- return $text;
- }
- sub _ParseAtCommand {
- my $result;
- my $command = shift;
- my $argument = shift;
- switch ($command) {
- case /eval/i { return eval $argument }
- case /roll/i { return roll($argument) }
- case /noprocess/i { return "@⟦noprocess⟧(${argument})" }
- case /process/i { return "@⟦process⟧(${argument})" }
- case /nomarkdown/i { $matharr[$mathctr] = ${argument}; return "(※" . $mathctr++ . ")" }
- case /comment/i { return "" }
- case /youtube/i { return "⟦video⟧http://www.youtube.com/watch?v=${argument}⟦/video⟧" }
- case /video/i { return "⟦video⟧http://www.youtube.com/watch?v=${argument}⟦/video⟧" }
- case /imgur/i { return "⟦img⟧http://i.imgur.com/${argument}⟦/img⟧" }
- case /twitter/i { return "⟦twitter id=\"${argument}\"⟧" }
- case /tweet/i { return "⟦twitter id=\"${argument}\"⟧" }
- case /marquee/i { return "⟦marquee scrollamount=\"2\"⟧${argument}⟦/marquee⟧" }
- case /move/i { return "⟦marquee scrollamount=\"2\"⟧${argument}⟦/marquee⟧" }
- case /noparse/i { return "⟦noparse⟧${argument}⟦/noparse⟧" }
- case /blockquote/i { return "⟦blockquote⟧${argument}⟦/blockquote⟧" }
- case /quote/i { return "⟦quote⟧${argument}⟦/quote⟧" }
- case /biggest/i { return "⟦font size=\"7\"⟧${argument}⟦/font⟧" }
- case /bigger/i { return "⟦font size=\"6\"⟧${argument}⟦/font⟧" }
- case /big/i { return "⟦font size=\"5\"⟧${argument}⟦/font⟧" }
- case /smallest/i { return "⟦font size=\"1\"⟧${argument}⟦/font⟧" }
- case /smaller/i { return "⟦font size=\"2\"⟧${argument}⟦/font⟧" }
- case /small/i { return "⟦font size=\"3\"⟧${argument}⟦/font⟧" }
- case /spoilers?/i { return "⟦spoiler⟧${argument}⟦/spoiler⟧" }
- case /(red|yellow|pink|green|purple|orange|blue)/i { my $color = lc $command; return "⟦font color=\"${color}\"⟧${argument}⟦/font⟧" }
- case /hide/i { return "⟦font color=\"efefef\"⟧${argument}⟦/color⟧" }
- case /courier/i { return "⟦font face=\"courier new\"⟧${argument}⟦/font⟧" }
- case /rot13/i { $_ = $argument; tr/A-Za-z/N-ZA-Mn-za-m/; return $_ }
- case /reverse/i { return reverse $argument }
- case /left/i { return "⟦div align=\"left\"⟧${argument}⟦/div⟧" }
- case /right/i { return "⟦div align=\"right\"⟧${argument}⟦/div⟧" }
- case /center/i { return "⟦div align=\"center\"⟧${argument}⟦/div⟧" }
- case /justify/i { return "⟦div align=\"justify\"⟧${argument}⟦/div⟧" }
- case /code/i { return "⟦code⟧${argument}⟦/code⟧" }
- case /pre/i { return "⟦pre⟧${argument}⟦/pre⟧" }
- case /sub/i { return "⟦sub⟧${argument}⟦/sub⟧" }
- case /sup/i { return "⟦sup⟧${argument}⟦/sup⟧" }
- case /hi/i { return "⟦highlight⟧${argument}⟦/highlight⟧" }
- case /tt/i { return "⟦tt⟧${argument}⟦/tt⟧" }
- case /b/i { return "⟦b⟧${argument}⟦/b⟧" }
- case /i/i { return "⟦i⟧${argument}⟦/i⟧" }
- case /s/i { return "⟦s⟧${argument}⟦/s⟧" }
- case /u/i { return "⟦u⟧${argument}⟦/u⟧" }
- else { return "failed to parse at-command ${command} with argument ${argument})" }
- }
- }
- sub _DoImages {
- #
- # Turn Markdown image shortcuts into <img> tags.
- #
- my $text = shift;
- #
- # First, handle reference-style labeled images: ![alt text][id]
- #
- $text =~ s{
- ( # wrap whole match in $1
- !\[
- (.*?) # alt text = $2
- \]
- [ ]? # one optional space
- (?:\n[ ]*)? # one optional newline followed by spaces
- \[
- (.*?) # id = $3
- \]
- )
- }{
- my $result;
- my $whole_match = $1;
- my $alt_text = $2;
- my $link_id = lc $3;
- if ($link_id eq "") {
- $link_id = lc $alt_text; # for shortcut links like ![this][].
- }
- $alt_text =~ s/"/"/g;
- if (defined $g_urls{$link_id}) {
- my $url = $g_urls{$link_id};
- $url =~ s! \* !$g_escape_table{'*'}!gx; # We've got to encode these to avoid
- $url =~ s! _ !$g_escape_table{'_'}!gx; # conflicting with italics/bold.
- $url =~ s! ~ !$g_escape_table{'~'}!gx; # conflicting with italics/bold.
- $result = "⟦img src=\"$url\" alt=\"$alt_text\"";
- if (defined $g_titles{$link_id}) {
- my $title = $g_titles{$link_id};
- $title =~ s! \* !$g_escape_table{'*'}!gx;
- $title =~ s! _ !$g_escape_table{'_'}!gx;
- $title =~ s! ~ !$g_escape_table{'~'}!gx;
- $result .= " title=\"$title\"";
- }
- $result .= $g_empty_element_suffix;
- }
- else {
- # If there's no such link ID, leave intact:
- $result = $whole_match;
- }
- $result;
- }xsge;
- #
- # Next, handle inline images: ![alt text](url "optional title")
- # Don't forget: encode * and _
- $text =~ s{
- ( # wrap whole match in $1
- !\[
- (.*?) # alt text = $2
- \]
- \( # literal paren
- [ \t]*
- <?(\S+?)>? # src url = $3
- [ \t]*
- ( # $4
- (['"]) # quote char = $5
- (.*?) # title = $6
- \5 # matching quote
- [ \t]*
- )? # title is optional
- \)
- )
- }{
- my $result;
- my $whole_match = $1;
- my $alt_text = $2;
- my $url = $3;
- my $title = '';
- if (defined($6)) {
- $title = $6;
- }
- $alt_text =~ s/"/"/g;
- $title =~ s/"/"/g;
- $url =~ s! \* !$g_escape_table{'*'}!gx; # We've got to encode these to avoid
- $url =~ s! _ !$g_escape_table{'_'}!gx; # conflicting with italics/bold.
- $url =~ s! ~ !$g_escape_table{'~'}!gx; # conflicting with italics/bold.
- $result = "⟦img src=\"$url\" alt=\"$alt_text\"";
- if (defined $title) {
- $title =~ s! \* !$g_escape_table{'*'}!gx;
- $title =~ s! _ !$g_escape_table{'_'}!gx;
- $title =~ s! ~ !$g_escape_table{'~'}!gx;
- $result .= " title=\"$title\"";
- }
- $result .= $g_empty_element_suffix;
- $result;
- }xsge;
- return $text;
- }
- sub _DoHeaders {
- my $text = shift;
- my @h_start = ("", "⟦font size=\"6\"⟧⟦b⟧", "⟦font size=\"5\"⟧⟦b⟧", "⟦font size=\"5\"⟧⟦b⟧⟦i⟧", "⟦font size=\"4\"⟧⟦b⟧", "⟦font size=\"4\"⟧⟦b⟧⟦i⟧", "⟦font size=\"4\"⟧⟦i⟧");
- my @h_end = ("", "⟦/b⟧⟦/font⟧", "⟦/b⟧⟦/font⟧", "⟦/i⟧⟦/b⟧⟦/font⟧", "⟦/b⟧⟦/font⟧", "⟦/i⟧⟦/b⟧⟦/font⟧", "⟦/i⟧⟦/font⟧");
- # Setext-style headers:
- # Header 1
- # ========
- #
- # Header 2
- # --------
- #
- $text =~ s{ ^(.+)[ \t]*\n=+[ \t]*\n+ }{
- $h_start[1] . _RunSpanGamut($1) . $h_end[1] . "\n\n";
- }egmx;
- $text =~ s{ ^(.+)[ \t]*\n-+[ \t]*\n+ }{
- $h_start[2] . _RunSpanGamut($1) . $h_end[2] . "\n\n";
- }egmx;
- # atx-style headers:
- # # Header 1
- # ## Header 2
- # ## Header 2 with closing hashes ##
- # ...
- # ###### Header 6
- #
- $text =~ s{
- ^(\#{1,6}) # $1 = string of #'s
- [ \t]*
- (.+?) # $2 = Header text
- [ \t]*
- \#* # optional closing #'s (not counted)
- \n+
- }{
- my $h_level = length($1);
- $h_start[$h_level] . _RunSpanGamut($2) . $h_end[$h_level] . "\n\n";
- }egmx;
- return $text;
- }
- sub _DoTables {
- #
- # Form BBCode tables
- #
- my $text = shift;
- my $less_than_tab = $g_tab_width - 1;
- #
- # Multiline table with headers
- #
- my $re_table_demarcation = qr{ ^ [ \t]* -{3,} \n }mx;
- my $re_table_divider_column = qr{ -{3,} }mx;
- my $re_table_divider = qr{
- ^
- [ \t]*
- ${re_table_divider_column} # Divider Head
- (?:[ \t]+ ${re_table_divider_column})+ # Divider Tail
- \n
- }mx;
- my $re_table_row = qr{
- (?:[^\n]+\n)+ # One or more non-blank lines
- (?!$re_table_demarcation)
- }mx;
- my $re_table_rows = qr{
- (?: ${re_table_row} \n)*
- (?:[^\n]+\n)+ # One or more non-blank lines
- (?=$re_table_demarcation)
- }mx;
- my $re_header_row = qr{
- $re_table_row
- (?=$re_table_divider)
- }mx;
- my $re_table = qr{
- ( # $1 = whole table
- $re_table_demarcation # row of dashes, before the header text
- ($re_header_row) # $2 = header row
- ($re_table_divider) # $3 = table divider
- (${re_table_rows}) # $4 = rows in table body
- $re_table_demarcation # end with a row of dashes
- (?:\z|\n) # then a blank line
- )
- }mx;
- $text =~ s{$re_table}{
- my $header = $2;
- my $divider = $3;
- my $rows = $4;
- my $bbc = "";
- my @rows = split(/\n\n/, $rows);
- my @align = get_row_alignments($header, $divider);
- for my $r (@rows) {
- my $c = 0;
- $bbc .= "⟦tr⟧" .
- join('', map {'⟦td align="' . $align[$c++] . '"⟧' . _RunBlockGamut(_Outdent($_)) . '⟦/td⟧'} get_fixed_width_row($r, $divider)) .
- "⟦/tr⟧\n";
- }
- $bbc = "⟦table⟧⟦thead⟧⟦tr⟧" . join('', map {'⟦th⟧' . $_ . '⟦/th⟧'} get_fixed_width_row($header, $divider)) . "⟦/tr⟧⟦/thead⟧\n" .
- "⟦tbody⟧" . $bbc . "⟦/tbody⟧⟦/table⟧\n\n";
- $bbc;
- }egmx;
- #
- # Multiline table without headers
- #
- $re_table_row = qr{
- (?:[^\n]+\n)+ # One or more non-blank lines
- (?!$re_table_divider)
- }mx;
- $re_table_rows = qr{
- (?: ${re_table_row} \n)+
- (?:[^\n]+\n)+ # One or more non-blank lines
- (?=$re_table_divider)
- }mx;
- my $re_table_wo_headers = qr{
- ( # $1 = whole table
- ($re_table_divider) # $2 = table divider
- (${re_table_rows}) # $3 = rows in table body
- $re_table_divider # table divider
- (?:\z|\n) # then a blank line
- )
- }mx;
- $text =~ s{$re_table_wo_headers}{
- my $divider = $2;
- my $rows = $3;
- my $bbc = "";
- my @rows = split(/\n\n/, $rows);
- my @align = get_row_alignments($rows[0], $divider);
- for my $r (@rows) {
- my $c = 0;
- $bbc .= "⟦tr⟧" .
- join('', map {'⟦td align="' . $align[$c++] . '"⟧' . _RunBlockGamut(_Outdent($_)) . '⟦/td⟧'} get_fixed_width_row($r, $divider)) .
- "⟦/tr⟧\n";
- }
- $bbc = "⟦table⟧⟦tbody⟧" . $bbc . "⟦/tbody⟧⟦/table⟧\n\n";
- $bbc;
- }egmx;
- #
- # Simple table with headers
- #
- $re_header_row = qr{
- [^\n]+ \n # One non-blank line
- (?=$re_table_divider)
- }mx;
- $re_table_row = qr{
- [^\n]+ \n # One non-blank line
- (?!$re_table_divider)
- }mx;
- $re_table_rows = qr{
- (?:${re_table_row})+
- (?=\n|\z)
- }mx;
- my $re_simple_table = qr{
- ( # $1 = whole table
- ($re_header_row) # $2 = header row
- ($re_table_divider) # $3 = table divider
- (${re_table_rows}) # $4 = rows in table body
- (?:\z|\n) # then a blank line
- )
- }mx;
- $text =~ s{$re_simple_table}{
- my $header = $2;
- my $divider = $3;
- my $rows = $4;
- my $bbc = "";
- my @rows = split(/\n/, $rows);
- my @align = get_row_alignments($header, $divider);
- for my $r (@rows) {
- my $c = 0;
- $bbc .= "⟦tr⟧" .
- join('', map {'⟦td align="' . $align[$c++] . '"⟧' . $_ . '⟦/td⟧'} get_fixed_width_row($r, $divider)) .
- "⟦/tr⟧\n";
- }
- $bbc = "⟦table⟧⟦thead⟧⟦tr⟧" . join('', map {'⟦th⟧' . $_ . '⟦/th⟧'} get_fixed_width_row($header, $divider)) . "⟦/tr⟧⟦/thead⟧\n" .
- "⟦tbody⟧" . $bbc . "⟦/tbody⟧⟦/table⟧\n\n";
- $bbc;
- }egmx;
- #
- # Simple table without headers
- #
- $re_table_row = qr{
- [^\n]+ \n # One non-blank line
- (?!$re_table_divider)
- }mx;
- $re_table_rows = qr{
- (?:${re_table_row})*
- [^\n]+\n # One non-blank line
- (?=$re_table_divider)
- }mx;
- my $re_simple_table_wo_headers = qr{
- ( # $1 = whole table
- ($re_table_divider) # $2 = table divider
- (${re_table_rows}) # $3 = rows in table body
- $re_table_divider # table divider
- (?:\z|\n) # then a blank line
- )
- }mx;
- $text =~ s{$re_simple_table_wo_headers}{
- my $divider = $2;
- my $rows = $3;
- my $bbc = "";
- my @rows = split(/\n/, $rows);
- my @align = get_row_alignments($rows[0], $divider);
- for my $r (@rows) {
- my $c = 0;
- $bbc .= "⟦tr⟧" .
- join('', map {'⟦td align="' . $align[$c++] . '"⟧' . $_ . '⟦/td⟧'} get_fixed_width_row($r, $divider)) .
- "⟦/tr⟧\n";
- }
- $bbc = "⟦table⟧⟦tbody⟧" . $bbc . "⟦/tbody⟧⟦/table⟧\n\n";
- $bbc;
- }egmx;
- #
- # Grid tables
- #
- my $re_row_demarcation = qr{ ^ [ \t]* \+ (?:-{3,}\+)+ \n }mx;
- my $re_table_divider_row = qr{ ^ [ \t]* \+ (?:\={3,}\+)+ \n }mx;
- $re_table_row = qr{
- (?: \| (?:[^\|]+ \|)+ \n)+ # One or more non-blank lines
- (?=$re_row_demarcation)
- }mx;
- $re_table_rows = qr{
- $re_table_row
- $re_row_demarcation
- }mx;
- $re_header_row = qr{
- (?: \| (?:[^\|]+ \|)+ \n)+ # One or more non-blank lines
- (?=$re_table_divider_row)
- }mx;
- $re_table = qr{
- ( # $1 = whole table
- $re_row_demarcation # row of dashes, before the header text
- ($re_header_row) # $2 = header row
- $re_table_divider_row # table divider
- (${re_table_rows}+) # $3 = rows in table body
- (?:\z|\n) # then a blank line
- )
- }mx;
- $text =~ s{$re_table}{
- my $header = $2;
- my $rows = $3;
- my $bbc = "";
- my @rows = split($re_row_demarcation, $rows);
- for my $r (@rows) {
- $bbc .= "⟦tr⟧" .
- join('', map {'⟦td⟧' . _RunBlockGamut(_Outdent($_)) . '⟦/td⟧'} get_grid_row($r)) .
- "⟦/tr⟧\n";
- }
- $bbc = "⟦table⟧⟦thead⟧⟦tr⟧" . join('', map {'⟦th⟧' . $_ . '⟦/th⟧'} get_grid_row($header)) . "⟦/tr⟧⟦/thead⟧\n" .
- "⟦tbody⟧" . $bbc . "⟦/tbody⟧⟦/table⟧\n\n";
- $bbc;
- }egmx;
- #
- # Pipe tables
- #
- $re_table_divider_column = qr{
- [ \t]*
- :?
- -{3,}
- :?
- [ \t]*
- }x;
- $re_table_divider = qr{
- \|? # Optional first pipe
- (?:${re_table_divider_column}) # Divider Head
- (?:(?:\+|\|)${re_table_divider_column})* # Divider Tail
- \|? # Optional last pipe
- \n
- }mx;
- $re_table_row = qr{
- \|? # Optional first pipe
- [^\|\n]+ # Row head
- (?:\|[^\|\n]+)* # Row tail
- \|? # Optional last pipe
- (?:\n|\z)
- }mx;
- $re_table = qr{
- ( # $1 = whole table
- ($re_table_row) # $2 = header row
- ($re_table_divider) # $3 = table divider
- (${re_table_row}+) # $4 = rows in table body
- (\z|\n) # $5
- )
- }mx;
- $text =~ s{
- $re_table
- }{
- my $table = $1;
- my $header = $2;
- my $divider = $3;
- my $body = $4;
- my @headings = map {trim($_)} ($header =~ /[^\|\n]+/g);
- my @cols = $divider =~ /${re_table_divider_column}/g;
- my @rows = $body =~ /${re_table_row}/g;
- # BBCode
- my $bbc = "";
- my $a = "";
- my $c = 0;
- my $n = scalar @cols;
- for(my $r=0; $r < scalar @rows; $r++) {
- $bbc .= "⟦tr⟧" .
- join('', map {'⟦td' . (($a = align($cols[$c++ % $n])) ne "" ? (' align="' . $a . '"') : '') . '⟧' . $_ . '⟦/td⟧'} cells($rows[$r])) .
- "⟦/tr⟧\n";
- }
- $bbc = "⟦table⟧⟦thead⟧⟦tr⟧" . join('', map {'⟦th⟧' . $_ . '⟦/th⟧'} @headings) . "⟦/tr⟧⟦/thead⟧\n" .
- "⟦tbody⟧" . $bbc . "⟦/tbody⟧⟦/table⟧\n\n";
- $bbc;
- }egmx;
- return $text;
- }
- sub align {
- my $text = shift;
- if($text =~ /\:-{3,}\:/){ return "center"; }
- if($text =~ /\:-{3,}/){ return "left"; }
- if($text =~ /-{3,}\:/){ return "right"; }
- return "";
- }
- sub trim {
- my $text = shift;
- $text =~ s/^\s+//;
- $text =~ s/\s+$//;
- return $text;
- }
- sub cells {
- my $text = shift;
- $text = trim($text);
- $text =~ s/^\|//;
- $text =~ s/\|$//;
- my @cells = map {trim($_)} (split /\|/, $text);
- return @cells;
- }
- sub get_grid_row {
- my $row = shift;
- my @cols = ();
- my @lines = split(/\n/, $row);
- foreach my $l (@lines) {
- my $c = 0;
- $l =~ s/^\s*\|//;
- $l =~ s/\|\s*$//;
- map {$cols[$c++].=trim($_)."\n"} split(/\|/, $l);
- }
- # return map {$_ =~ s/\n+\z//; $_} @cols;
- return @cols;
- }
- sub get_column_widths {
- my $divider = shift;
- my $trim_trailing_spaces = shift;
- $divider =~ s/\s+$//;
- my @divs = split(/(?<=\s)(?=-)/, $divider);
- if($trim_trailing_spaces){ my @temp = map { s/(\s+)$//mgx; $_; } @divs; @divs = @temp; }
- my @widths = map {length $_} @divs;
- my $n = scalar @widths;
- my $i = 0;
- # print "$n columns:\n";
- # foreach $w (@widths) {print "column ", $i, ": ", $w, " width: '", $divs[$i++] ,"'\n";}
- # print "\n\n";
- return @widths;
- }
- sub get_row_alignments {
- my @lines = split(/\n/, shift); # A row consists of more than one line
- my $divider = shift;
- my @widths = get_column_widths($divider, 0);
- my @divs = get_column_widths($divider, 1);
- my $offset;
- my $n = scalar @widths;
- my @aligns = ();
- my $l = $lines[0];
- $offset = 0;
- for(my $c = 0; $c < $n; $c++) {
- my $col .= substr($l, $offset, $divs[$c]);
- if($col =~ /^\s+[^\s]+\s+$/){ $aligns[$c] = "center"; }
- elsif($col =~ /^\s+[^\s]+$/){ $aligns[$c] = "right"; }
- elsif($col =~ /^[^\s]+\s+$/){ $aligns[$c] = "left"; }
- elsif(length($col) < $divs[$c]){ $aligns[$c] = "left"; }
- else { $aligns[$c] = "left"; } # default
- $offset += $widths[$c];
- # print "column ", $c, " is '", $col, "' with align = '", $aligns[$c], "'\n";
- }
- return @aligns;
- }
- sub get_fixed_width_row {
- my @lines = split(/\n/, shift); # A row consists of more than one line
- my $divider = shift;
- my $is_header = shift || 0;
- my $rv;
- my @widths = get_column_widths($divider);
- my $offset;
- my $n = scalar @widths;
- my @cols = ();
- for my $l (@lines) {
- $offset = 0;
- for(my $c = 0; $c < $n; $c++) {
- $cols[$c] .= trim(substr($l, $offset, $widths[$c])) . "\n";
- $offset += $widths[$c];
- }
- };
- # for(my $c = 0; $c < $n; $c++) {
- # # trim trailing blank lines:
- # $cols[$c] =~ s/\n+\z//;
- # }
- return @cols;
- };
- sub _DoLists {
- #
- # Form HTML ordered (numbered) and unordered (bulleted) lists.
- #
- my $text = shift;
- my $less_than_tab = $g_tab_width - 1;
- # Re-usable patterns to match list item bullets and number markers:
- my $marker_ul = qr/[*+-]/;
- my $marker_ol = qr/\d+[.]/;
- my $marker_any = qr/(?:$marker_ul|$marker_ol)/;
- # Re-usable pattern to match any entirel ul or ol list:
- my $whole_list = qr{
- ( # $1 = whole list
- ( # $2
- [ ]{0,$less_than_tab}
- (${marker_any}) # $3 = first list item marker
- [ \t]+
- )
- (?s:.+?)
- ( # $4
- \z
- |
- \n{2,}
- (?=\S)
- (?! # Negative lookahead for another list item marker
- [ \t]*
- ${marker_any}[ \t]+
- )
- )
- )
- }mx;
- # We use a different prefix before nested lists than top-level lists.
- # See extended comment in _ProcessListItems().
- #
- # Note: There's a bit of duplication here. My original implementation
- # created a scalar regex pattern as the conditional result of the test on
- # $g_list_level, and then only ran the $text =~ s{...}{...}egmx
- # substitution once, using the scalar as the pattern. This worked,
- # everywhere except when running under MT on my hosting account at Pair
- # Networks. There, this caused all rebuilds to be killed by the reaper (or
- # perhaps they crashed, but that seems incredibly unlikely given that the
- # same script on the same server ran fine *except* under MT. I've spent
- # more time trying to figure out why this is happening than I'd like to
- # admit. My only guess, backed up by the fact that this workaround works,
- # is that Perl optimizes the substition when it can figure out that the
- # pattern will never change, and when this optimization isn't on, we run
- # afoul of the reaper. Thus, the slightly redundant code to that uses two
- # static s/// patterns rather than one conditional pattern.
- if ($g_list_level) {
- $text =~ s{
- ^
- $whole_list
- }{
- my $list = $1;
- my $list_type = ($3 =~ m/$marker_ul/) ? "ul" : "ol";
- # Turn double returns into triple returns, so that we can make a
- # paragraph for the last item in a list, if necessary:
- $list =~ s/\n{2,}/\n\n\n/g;
- my $result = _ProcessListItems($list, $marker_any);
- $result = "⟦$list_type⟧\n" . $result . "⟦/$list_type⟧\n";
- $result;
- }egmx;
- }
- else {
- $text =~ s{
- (?:(?<=\n\n)|\A\n?)
- $whole_list
- }{
- my $list = $1;
- my $list_type = ($3 =~ m/$marker_ul/) ? "ul" : "ol";
- # Turn double returns into triple returns, so that we can make a
- # paragraph for the last item in a list, if necessary:
- $list =~ s/\n{2,}/\n\n\n/g;
- my $result = _ProcessListItems($list, $marker_any);
- $result = "⟦$list_type⟧\n" . $result . "⟦/$list_type⟧\n";
- $result;
- }egmx;
- }
- return $text;
- }
- sub _ProcessListItems {
- #
- # Process the contents of a single ordered or unordered list, splitting it
- # into individual list items.
- #
- my $list_str = shift;
- my $marker_any = shift;
- # The $g_list_level global keeps track of when we're inside a list.
- # Each time we enter a list, we increment it; when we leave a list,
- # we decrement. If it's zero, we're not in a list anymore.
- #
- # We do this because when we're not inside a list, we want to treat
- # something like this:
- #
- # I recommend upgrading to version
- # 8. Oops, now this line is treated
- # as a sub-list.
- #
- # As a single paragraph, despite the fact that the second line starts
- # with a digit-period-space sequence.
- #
- # Whereas when we're inside a list (or sub-list), that line will be
- # treated as the start of a sub-list. What a kludge, huh? This is
- # an aspect of Markdown's syntax that's hard to parse perfectly
- # without resorting to mind-reading. Perhaps the solution is to
- # change the syntax rules such that sub-lists must start with a
- # starting cardinal number; e.g. "1." or "a.".
- $g_list_level++;
- # trim trailing blank lines:
- $list_str =~ s/\n{2,}\z/\n/;
- $list_str =~ s{
- (\n)? # leading line = $1
- (^[ \t]*) # leading whitespace = $2
- ($marker_any) [ \t]+ # list marker = $3
- ((?s:.+?) # list item text = $4
- (\n{1,2}))
- (?= \n* (\z | \2 ($marker_any) [ \t]+))
- }{
- my $item = $4;
- my $leading_line = $1;
- my $leading_space = $2;
- if ($leading_line or ($item =~ m/\n{2,}/)) {
- $item = _RunBlockGamut(_Outdent($item));
- }
- else {
- # Recursion for sub-lists:
- $item = _DoLists(_Outdent($item));
- chomp $item;
- $item = _RunSpanGamut($item);
- }
- "⟦li⟧" . $item . "⟦/li⟧\n";
- }egmx;
- $g_list_level--;
- return $list_str;
- }
- sub _DoCodeBlocks {
- #
- # Process Markdown `<pre><code>` blocks.
- #
- my $text = shift;
- $text =~ s{
- (?:\n\n|\A)
- ( # $1 = the code block -- one or more lines, starting with a space/tab
- (?:
- (?:[ ]{$g_tab_width} | \t) # Lines must start with a tab or a tab-width of spaces
- .*\n+
- )+
- )
- ((?=^[ ]{0,$g_tab_width}\S)|\Z) # Lookahead for non-space at line-start, or end of doc
- }{
- my $codeblock = $1;
- my $result; # return value
- $codeblock = _EncodeCode(_Outdent($codeblock));
- $codeblock = _Detab($codeblock);
- $codeblock =~ s/\A\n+//; # trim leading newlines
- $codeblock =~ s/\s+\z//; # trim trailing whitespace
- $result = "\n\n⟦code⟧" . $codeblock . "⟦/code⟧\n\n";
- $result;
- }egmx;
- return $text;
- }
- sub _DoCodeSpans {
- #
- # * Backtick quotes are used for <code></code> spans.
- #
- # * You can use multiple backticks as the delimiters if you want to
- # include literal backticks in the code span. So, this input:
- #
- # Just type ``foo `bar` baz`` at the prompt.
- #
- # Will translate to:
- #
- # <p>Just type <code>foo `bar` baz</code> at the prompt.</p>
- #
- # There's no arbitrary limit to the number of backticks you
- # can use as delimters. If you need three consecutive backticks
- # in your code, use four for delimiters, etc.
- #
- # * You can use spaces to get literal backticks at the edges:
- #
- # ... type `` `bar` `` ...
- #
- # Turns to:
- #
- # ... type <code>`bar`</code> ...
- #
- my $text = shift;
- $text =~ s@
- (`+) # $1 = Opening run of `
- (.+?) # $2 = The code block
- (?<!`)
- \1 # Matching closer
- (?!`)
- @
- my $c = "$2";
- $c =~ s/^[ \t]*//g; # leading whitespace
- $c =~ s/[ \t]*$//g; # trailing whitespace
- $c = _EncodeCode($c);
- "⟦code⟧$c⟦/code⟧";
- @egsx;
- return $text;
- }
- sub _EncodeCode {
- #
- # Encode/escape certain characters inside Markdown code runs.
- # The point is that in code, these characters are literals,
- # and lose their special Markdown meanings.
- #
- local $_ = shift;
- # Encode all ampersands; HTML entities are not
- # entities within a Markdown code span.
- s/&/&/g;
- # Encode $'s, but only if we're running under Blosxom.
- # (Blosxom interpolates Perl variables in article bodies.)
- {
- no warnings 'once';
- if (defined($blosxom::version)) {
- s/\$/$/g;
- }
- }
- # Do the angle bracket song and dance:
- s! < !<!gx;
- s! > !>!gx;
- # Now, escape characters that are magic in Markdown:
- s! \* !$g_escape_table{'*'}!gx;
- s! _ !$g_escape_table{'_'}!gx;
- s! ~ !$g_escape_table{'~'}!gx;
- s! { !$g_escape_table{'{'}!gx;
- s! } !$g_escape_table{'}'}!gx;
- s! \[ !$g_escape_table{'['}!gx;
- s! \] !$g_escape_table{']'}!gx;
- s! \\ !$g_escape_table{'\\'}!gx;
- return $_;
- }
- sub _DoItalicsAndBold {
- my $text = shift;
- $text =~ s{ __ (?=\S) (.+?) (?<=\S) __ }
- {⟦u⟧$1⟦/u⟧}gsx;
- $text =~ s{ \*\* (?=\S) (.+?) (?<=\S) \*\* }
- {⟦b⟧$1⟦/b⟧}gsx;
- $text =~ s{ ~~ (?=\S) (.+?) (?<=\S) ~~ }
- {⟦s⟧$1⟦/s⟧}gsx;
- $text =~ s{ \^\^ (?=\S) (.+?) (?<=\S) \^\^ }
- {⟦font size="5"⟧$1⟦/font⟧}gsx;
- $text =~ s{ \^ (?=\S) (.+?) (?<=\S) \^ }
- {⟦sup⟧$1⟦/sup⟧}gsx;
- $text =~ s{ ~ (?=\S) (.+?) (?<=\S) ~ }
- {⟦sub⟧$1⟦/sub⟧}gsx;
- $text =~ s{ (\*|_) (?=\S) (.+?) (?<=\S) \1 }
- {⟦i⟧$2⟦/i⟧}gsx;
- return $text;
- }
- sub _DoBlockQuotes {
- my $text = shift;
- $text =~ s{
- ( # Wrap whole match in $1
- (
- ^[ \t]*>[ \t]? # '>' at the start of a line
- .+\n # rest of the first line
- (.+\n)* # subsequent consecutive lines
- \n* # blanks
- )+
- )
- }{
- my $bq = $1;
- $bq =~ s/^[ \t]*>[ \t]?//gm; # trim one level of quoting
- $bq =~ s/^[ \t]+$//mg; # trim whitespace-only lines
- $bq = _RunBlockGamut($bq); # recurse
- # $bq =~ s/^/ /g;
- # These leading spaces screw with <pre> content, so we need to fix that:
- $bq =~ s{
- (\s*\[pre\].+?\[/pre\])
- }{
- my $pre = $1;
- $pre =~ s/^ //mg;
- $pre;
- }egsx;
- "⟦quote⟧$bq⟦/quote⟧\n\n";
- }egmx;
- return $text;
- }
- sub _FormParagraphs {
- #
- # Params:
- # $text - string to process with html <p> tags
- #
- my $text = shift;
- # Strip leading and trailing lines:
- $text =~ s/\A\n+//;
- $text =~ s/\n+\z//;
- my @grafs = split(/\n{2,}/, $text);
- #
- # _RunSpanGamut for each paragraph
- #
- foreach (@grafs) {
- unless (defined( $g_html_blocks{$_} )) {
- $_ = _RunSpanGamut($_);
- s/^([ \t]*)//;
- }
- }
- #
- # Unhashify BBCode blocks
- #
- foreach (@grafs) {
- if (defined( $g_html_blocks{$_} )) {
- $_ = $g_html_blocks{$_};
- }
- }
- return join "\n\n", @grafs;
- }
- sub _EncodeAmpsAndAngles {
- # Smart processing for ampersands and angle brackets that need to be encoded.
- my $text = shift;
- # Ampersand-encoding based entirely on Nat Irons's Amputator MT plugin:
- # http://bumppo.net/projects/amputator/
- $text =~ s/&(?!#?[xX]?(?:[0-9a-fA-F]+|\w+);)/&/g;
- # Encode naked <'s
- $text =~ s{<(?![a-z/?\$!])}{<}gi;
- return $text;
- }
- sub _EncodeBackslashEscapes {
- #
- # Parameter: String.
- # Returns: The string, with after processing the following backslash
- # escape sequences.
- #
- local $_ = shift;
- s! \\\\ !$g_escape_table{'\\'}!gx; # Must process escaped backslashes first.
- s! \\` !$g_escape_table{'`'}!gx;
- s! \\\* !$g_escape_table{'*'}!gx;
- s! \\_ !$g_escape_table{'_'}!gx;
- s{ \\~ }{$g_escape_table{'~'}}gx;
- s! \\\{ !$g_escape_table{'{'}!gx;
- s! \\\} !$g_escape_table{'}'}!gx;
- s! \\\[ !$g_escape_table{'['}!gx;
- s! \\\] !$g_escape_table{']'}!gx;
- s! \\\( !$g_escape_table{'('}!gx;
- s! \\\) !$g_escape_table{')'}!gx;
- s! \\> !$g_escape_table{'>'}!gx;
- s! \\\# !$g_escape_table{'#'}!gx;
- s! \\\+ !$g_escape_table{'+'}!gx;
- s! \\\- !$g_escape_table{'-'}!gx;
- s! \\\. !$g_escape_table{'.'}!gx;
- s{ \\! }{$g_escape_table{'!'}}gx;
- return $_;
- }
- sub _DoAutoLinks {
- my $text = shift;
- $text =~ s{<((https?|ftp):[^'">\s]+)>}{⟦a href="$1"⟧$1⟦/a⟧}gi;
- # Email addresses: <address@domain.foo>
- $text =~ s{
- <
- (?:mailto:)?
- (
- [-.\w]+
- \@
- [-a-z0-9]+(\.[-a-z0-9]+)*\.[a-z]+
- )
- >
- }{
- _EncodeEmailAddress( _UnescapeSpecialChars($1) );
- }egix;
- return $text;
- }
- sub _EncodeEmailAddress {
- #
- # Input: an email address, e.g. "foo@example.com"
- #
- # Output: the email address as a mailto link, with each character
- # of the address encoded as either a decimal or hex entity, in
- # the hopes of foiling most address harvesting spam bots. E.g.:
- #
- # <a href="mailto:foo@e
- # xample.com">foo
- # @example.com</a>
- #
- # Based on a filter by Matthew Wickline, posted to the BBEdit-Talk
- # mailing list: <http://tinyurl.com/yu7ue>
- #
- my $addr = shift;
- srand;
- my @encode = (
- sub { '&#' . ord(shift) . ';' },
- sub { '&#x' . sprintf( "%X", ord(shift) ) . ';' },
- sub { shift },
- );
- $addr = "mailto:" . $addr;
- $addr =~ s{(.)}{
- my $char = $1;
- if ( $char eq '@' ) {
- # this *must* be encoded. I insist.
- $char = $encode[int rand 1]->($char);
- } elsif ( $char ne ':' ) {
- # leave ':' alone (to spot mailto: later)
- my $r = rand;
- # roughly 10% raw, 45% hex, 45% dec
- $char = (
- $r > .9 ? $encode[2]->($char) :
- $r < .45 ? $encode[1]->($char) :
- $encode[0]->($char)
- );
- }
- $char;
- }gex;
- $addr = qq{⟦a href="$addr"⟧$addr⟦/a⟧};
- $addr =~ s{"⟧.+?:}{"⟧}; # strip the mailto: from the visible part
- return $addr;
- }
- sub _UnescapeSpecialChars {
- #
- # Swap back in all the special characters we've hidden.
- #
- my $text = shift;
- while( my($char, $hash) = each(%g_escape_table) ) {
- $text =~ s/$hash/$char/g;
- }
- return $text;
- }
- sub _TokenizeHTML {
- #
- # Parameter: String containing HTML markup.
- # Returns: Reference to an array of the tokens comprising the input
- # string. Each token is either a tag (possibly with nested,
- # tags contained therein, such as <a href="<MTFoo>">, or a
- # run of text between tags. Each element of the array is a
- # two-element array; the first is either 'tag' or 'text';
- # the second is the actual value.
- #
- #
- # Derived from the _tokenize() subroutine from Brad Choate's MTRegex plugin.
- # <http://www.bradchoate.com/past/mtregex.php>
- #
- my $str = shift;
- my $pos = 0;
- my $len = length $str;
- my @tokens;
- my $depth = 6;
- my $nested_tags = join('|', ('(?:<[a-z/!$](?:[^<>]') x $depth) . (')*>)' x $depth);
- my $match = qr/(?s: <! ( -- .*? -- \s* )+ > ) | # comment
- (?s: <\? .*? \?> ) | # processing instruction
- $nested_tags/ix; # nested tags
- while ($str =~ m/($match)/g) {
- my $whole_tag = $1;
- my $sec_start = pos $str;
- my $tag_start = $sec_start - length $whole_tag;
- if ($pos < $tag_start) {
- push @tokens, ['text', substr($str, $pos, $tag_start - $pos)];
- }
- push @tokens, ['tag', $whole_tag];
- $pos = pos $str;
- }
- push @tokens, ['text', substr($str, $pos, $len - $pos)] if $pos < $len;
- \@tokens;
- }
- sub _Outdent {
- #
- # Remove one level of line-leading tabs or spaces
- #
- my $text = shift;
- $text =~ s/^(\t|[ ]{1,$g_tab_width})//gm;
- return $text;
- }
- sub _Detab {
- #
- # Cribbed from a post by Bart Lateur:
- # <http://www.nntp.perl.org/group/perl.macperl.anyperl/154>
- #
- my $text = shift;
- $text =~ s{(.*?)\t}{$1.(' ' x ($g_tab_width - length($1) % $g_tab_width))}ge;
- return $text;
- }
- sub _PreProcess {
- my $text = shift;
- # Here we process stuff like @[include](file) that must be done first
- # `include` inserts a Markdown file without running Markdown2BBCode.pl on it first like `process`
- # so included files should not contain YAML metadata unless the include statement is the first line!
- $text =~ s{
- @ \[ include \]
- \( # literal paren
- ($g_nested_parens) # filename = $1
- \)
- }{
- local $/=undef;
- open FILE, $1 or die "Couldn't open file: $!";
- my $string = <FILE>;
- close FILE;
- $string;
- }exg;
- return $text;
- }
- sub _PostProcess {
- my $text = shift;
- # Remove HTML comments
- $text =~ s/<! ( -- .*? -- \s* )+ >//gsx;
- # Translate double square brackets we use internally back to the regular square brackets BBCode uses
- $text =~ tr/⟦⟧/\[\]/;
- # @[process](file)
- # Unlike `include`, `process` actually runs Markdown2BBCode.pl on a file and includes the results
- $text =~ s{
- @ \[ process \]
- \( # literal paren
- ($g_nested_parens) # filename = $1
- \)
- }{
- `perl Markdown2BBCode.pl $1`;
- }egx;
- # @[noprocess](file)
- # Unlike `process`, `noprocess` does no processing on a file. It includes it literally
- $text =~ s{
- @ \[ noprocess \]
- \( # literal paren
- ($g_nested_parens) # filename = $1
- \)
- }{
- $matharr[$mathctr] = `cat $1`;
- "(※" . $mathctr++ . ")"
- }egx;
- # Put math formulae back, unescape dollar signs
- $text =~ s/\(※(\d+)\)/$matharr[$1]/gsex;
- $text =~ s/\\\$/\$/gs;
- return $text;
- }
- #
- # YAML stuff
- #
- sub _InitYAML {
- my $text = shift;
- $text =~ s{
- ^
- (
- (?: \% YAML \s+ [\d\.]+ \n)?
- --- \n
- (?: .* \n)+?
- (--- | \.\.\.) \n
- )
- }{
- my $string = $1;
- if(length $string > 0) {
- # Found some YAML at the beginning of the doc
- # Let YAML::Tiny parse it
- $yaml = YAML::Tiny->read_string($string);
- } else {
- # No YAML Found! Create an empty YAML object
- $yaml = new YAML::Tiny;
- }
- ""
- }mex;
- return $text;
- }
- sub _DoMetadataVariables {
- my $text = shift;
- # Substitute tags like this
- # [%key]
- # with the YAML metadata from the file header corresponding to the key
- $text =~ s{
- ( # wrap whole match in $1
- \[
- \%
- (.*?) # key = $2
- \]
- )
- }{
- my $key = $2;
- my $value = $yaml->[0]->{$key};
- if($value eq ""){
- $value = $key;
- }
- $value;
- }gsex;
- return $text;
- }
- #
- # LaTeX Math stuff
- #
- sub _InitMath {
- @matharr = ();
- $mathctr = 1;
- # Commonly used symbols
- %mathsym = (
- # Operators
- pm => '±',
- cdot => '·',
- div => '÷',
- times => '×',
- prod => '∏',
- sum => '∑',
- surd => '√',
- cuberoot => '∛',
- wedge => '∧',
- vee => '∨',
- oplus => '⊕',
- otimes => '⊗',
- ast => '∗',
- circ => '°',
- bowtie => '⋈',
- # Comparison
- neq => '≠',
- approx => '≈',
- napprox => '≉',
- propto => '∝',
- equiv => '≡',
- nequiv => '≢',
- leq => '≤',
- geq => '≥',
- ll => '≪',
- gg => '≫',
- # Calculus
- ldots => '...',
- infty => '∞',
- prime => '′',
- second => '″',
- partial => '∂',
- nabla => '∇',
- int => '∫',
- iint => '∬',
- iiint => '∭',
- oint => '∮',
- oiint => '∯',
- oiiint => '∰',
- sumint => '⨋',
- # Script
- ell => 'ℓ',
- Re => 'ℜ',
- Im => 'ℑ',
- # Logic
- neg => '¬',
- forall => '∀',
- exists => '∃',
- nexists => '∄',
- therefore => '∴',
- QED => '∎',
- vdash => '⊢',
- vDash => '⊨',
- bot => '⊥',
- top => '⊤',
- # Set theory
- in => '∈',
- setminus => '\\',
- backslash => '\\',
- varnothing => '∅',
- emptyset => '∅',
- cap => '∩',
- cup => '∪',
- subset => '⊂',
- supset => '⊃',
- subseteq => '⊆',
- subsetneq => '⊊',
- aleph => 'ℵ',
- # Physics
- Planckconst => 'ℎ',
- hslash => 'ℏ',
- langle => '⟨',
- rangle => '⟩',
- # Geometry
- rightangle => '∟',
- angle => '∠',
- parallel => '∥',
- nparallel => '∦',
- perp => '⟂',
- bigtriangleup => '△',
- square => '◻',
- blacksquare => '◼',
- hrectangle => '▭',
- vrectangle => '▯',
- parallelogram => '▱',
- diamond => '⋄',
- Diamond => '◇',
- Diamondblack => '◆',
- # Spaces
- quad => ' ',
- qquad => ' ',
- textvisiblespace => '␣',
- # Money
- cent => '¢',
- pounds => '£',
- yen => '¥',
- euro => '€',
- # Arrows
- to => '→',
- rightarrow => '→',
- mapsto => '↦',
- Rightarrow => '⇒',
- Leftrightarrow => '⇔',
- hookrightarrow => '↪',
- uparrow => '↑',
- downarrow => '↓',
- leftarrow => '←',
- rightrightarrows => '⇉',
- leadsto => '⤳',
- # Typographical marks
- P => '', # Pilcrow
- S => '§', # Section sign
- ddag => '‡',
- dag => '†',
- textquestiondown => '¿',
- textexclamdown => '¡',
- # Dingbats
- dagger => '†',
- checkmark => '✓',
- ballotx => '✗',
- danger => '☡',
- # Music
- sharp => '♯',
- natural => '♮',
- flat => '♭',
- # Card suits
- spadesuit => '♠',
- heartsuit => '♡',
- diamondsuit => '♢',
- clubsuit => '♣',
- # Legal
- copyright => 'Ⓒ',
- texttrademark => '™',
- textregistered => '®',
- # Greek letters
- Alpha => 'Α',
- alpha => 'α',
- Beta => 'Β',
- beta => 'β',
- Gamma => 'Γ',
- gamma => 'γ',
- Delta => 'Δ',
- delta => 'δ',
- Epsilon => 'Ε',
- epsilon => 'ε',
- Zeta => 'Ζ',
- zeta => 'ζ',
- Eta => 'Η',
- eta => 'η',
- Theta => 'Θ',
- theta => 'θ',
- Iota => 'Ι',
- iota => 'ι',
- Kappa => 'Κ',
- kappa => 'κ',
- Lambda => 'Λ',
- lambda => 'λ',
- Mu => 'Μ',
- mu => 'μ',
- Nu => 'Ν',
- nu => 'ν',
- Xi => 'Ξ',
- xi => 'ξ',
- Omicron => 'Ο',
- omicron => 'ο',
- Pi => 'Π',
- pi => 'π',
- Rho => 'Ρ',
- rho => 'ρ',
- Sigma => 'Σ',
- sigma => 'σ',
- Tau => 'Τ',
- tau => 'τ',
- Upsilon => 'Υ',
- upsilon => 'υ',
- Phi => 'Φ',
- phi => 'φ',
- Chi => 'Χ',
- chi => 'χ',
- Psi => 'Ψ',
- psi => 'ψ',
- Omega => 'Ω',
- omega => 'ω',
- # Catch "text" tag
- text => '::text'
- );
- # Blackboard
- %mathbb = (
- N => 'ℕ', # Natural numbers
- Z => 'ℤ', # Integers
- Q => 'ℚ', # Rationals
- R => 'ℝ', # Reals
- C => 'ℂ', # Complex
- H => 'ℍ' # Quaternions
- );
- # Calligraphy
- %mathcal = (
- e => 'ℯ',
- E => 'ℰ',
- F => 'ℱ',
- H => 'ℋ', # Hamiltonian
- L => 'ℒ', # Lagrangian
- M => 'ℳ'
- );
- # Six-sided dice faces
- %dice = (
- i => '⚀',
- ii => '⚁',
- iii => '⚂',
- iv => '⚃',
- v => '⚄',
- vi => '⚅'
- );
- }
- sub num {
- my $n = shift;
- $n = _DoTheMath($n);
- if($n =~ /^\s*([\d\.]+)\s*$/){ return $1; }
- if( $n !~ /\s/ ){ return $n; }
- return "(" . $n . ")" ;
- }
- sub frac {
- my ($a, $b) = @_;
- return num($a) . "/" . num($b);
- }
- sub _DoTheMath {
- my $text = shift;
- $text =~ s/_\{($g_nested_braces)\}/\[sub\]$1\[\/sub\]/g;
- $text =~ s/\^\{($g_nested_braces)\}/\[sup\]$1\[\/sup\]/g;
- $text =~ s/_(\\?\w+)/\[sub\]$1\[\/sub\]/g;
- $text =~ s/\^(\\?\w+)/\[sup\]$1\[\/sup\]/g;
- $text =~ s/\\not\s*\\in/∉/g;
- $text =~ s/\\frac\{($g_nested_braces)\}\{($g_nested_braces)\}/frac($1,$2)/eg;
- $text =~ s/\\sqrt\{($g_nested_braces)\}/"√".num($1)/eg;
- $text =~ s/\\mathbb\{(\w+)\}/$mathbb{$1}/eg;
- $text =~ s/\\mathcal\{(\w+)\}/$mathcal{$1}/eg;
- $text =~ s/\\dice\{([vi]{1,3})\}/$dice{$1}/eg;
- $text =~ s/\\pmod\s+(\w+)/"(mod " . $1 . ")"/eg;
- $text =~ s/\\bmod\s+(\w+)/"mod " . $1/eg;
- $text =~ s/\\(sinh|cosh|tanh|coth|arcsin|arccos|arctan|arccot|sin|cos|tan|cot|sec|csc|log|exp)[ ]*/$1/g;
- $text =~ s/\\([a-zA-Z]+)[ ]*/$mathsym{$1}/eg;
- $text =~ s/::text\{([^\}]+)\}/$1/g; # TODO: This needs to be hashed at the beginning and restored at the end to prevent interpoloation!
- # Escapes
- $text =~ s/\\!//g;
- $text =~ s/\\,//g;
- $text =~ s/\\:/ /g;
- $text =~ s/\\;/ /g;
- #$text =~ s/\\>/ /g;
- $text =~ s/\\\}/\}/g;
- $text =~ s/\\\{/\{/g;
- $text =~ s/\\\\/\\/g;
- # normalize whitespace
- #$text =~ s/^\s+//g;
- #$text =~ s/\s+$//g;
- #$text =~ s/\s+/ /g;
- return $text;
- }
- sub _HashMathSpans {
- my $text = shift;
- $text =~ s{
- (?<! \\ )
- \$
- (?! \s )
- ( .+? )
- (?<! \s )
- \$
- }{
- $matharr[$mathctr] = _DoTheMath($1);
- "(※" . $mathctr++ . ")"
- }egmx;
- return $text;
- }
- 1;
- __END__
- =pod
- =head1 NAME
- B<Markdown2BBCode>
- =head1 SYNOPSIS
- B<Markdown2BBCode.pl> [ B<--version> ] [ B<-shortversion> ]
- [ I<file> ... ]
- =head1 DESCRIPTION
- Markdown2BBCode.pl is a text-to-BBCode filter; it translates an easy-to-read /
- easy-to-write structured text format into BBCode. Markdown's text format
- is most similar to that of plain text email, and supports features such
- as headers, *emphasis*, code blocks, blockquotes, and links.
- Markdown2BBCode's syntax is designed not as a generic markup language, but
- specifically to serve as a front-end to BBCode. You can use span-level
- BBCode tags anywhere in a Markdown2BBCode document, and you can use block level
- BBCode tags (like [div] and [table] as well).
- For more information about Markdown's syntax, see:
- http://daringfireball.net/projects/markdown/
- =head1 OPTIONS
- Use "--" to end switch parsing. For example, to open a file named "-z", use:
- Markdown2BBCode.pl -- -z
- =over 4
- =item B<-v>, B<--version>
- Display Markdown2BBCode's version number and copyright information.
- =item B<-s>, B<--shortversion>
- Display the short-form version number.
- =back
- =head1 BUGS
- To file bug reports or feature requests (other than topics listed in the
- Caveats section above) please send email to:
- krusader74 at yahoo dot com
- Please include with your report: (1) the example input; (2) the output
- you expected; (3) the output Markdown2BBCode.pl actually produced.
- =head1 VERSION HISTORY
- See the readme file for detailed release notes for this version.
- 2.0 -- 26 Jan 2016
- 3.0 -- 20 Feb 2016
- * Fixed: The \LaTex{} expressions \frac{}, \sqrt{}, ^{}, _{}
- now allow for nested braces.
- * Added: Preliminary support for Pandoc simple tables, multiline
- tables, and grid tables. See http://pandoc.org/README.html#tables
- =head1 AUTHOR
- Original Markdown.pl Perl script by John Gruber
- http://daringfireball.net
- PHP port and other contributions by Michel Fortin
- http://michelf.com
- BBCode port and other contributions by Krusader74
- http://odd74.proboards.com/user/2141
- http://pastebin.com/u/krusader74
- krusader74 at yahoo dot com
- =head1 COPYRIGHT AND LICENSE
- Copyright (c) 2003-2004 John Gruber
- <http://daringfireball.net/>
- All rights reserved.
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions are
- met:
- * Redistributions of source code must retain the above copyright notice,
- this list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in the
- documentation and/or other materials provided with the distribution.
- * Neither the name "Markdown" nor the names of its contributors may
- be used to endorse or promote products derived from this software
- without specific prior written permission.
- This software is provided by the copyright holders and contributors "as
- is" and any express or implied warranties, including, but not limited
- to, the implied warranties of merchantability and fitness for a
- particular purpose are disclaimed. In no event shall the copyright owner
- or contributors be liable for any direct, indirect, incidental, special,
- exemplary, or consequential damages (including, but not limited to,
- procurement of substitute goods or services; loss of use, data, or
- profits; or business interruption) however caused and on any theory of
- liability, whether in contract, strict liability, or tort (including
- negligence or otherwise) arising in any way out of the use of this
- software, even if advised of the possibility of such damage.
- =cut
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement