
use warnings;
use strict;

package Mutulate;
use base 'Exporter';

our @EXPORT = qw/msg_mutulate/;

use Log::Report      qw(simplelists);
use Mail::Message;
use Mail::Message::Part;
use Mail::Message::Body::String;
#use HTML::Scrubber;

use Encode;
            
sub _invalidate_plain($);
sub _remove_plain_alternative($$);
sub _add_plain_alternative($$);
sub _remove_html_alternative($$);

=pod

  my $mod = msg_mutulate $msg, @options;

As @options, you may pass anything accepted by _invalidate_plain
    always_remove_plain_alternative  boolean
    invalidate_plain_alternative     percentage (1-100)
    add_plain_alternative_for_html   boolean
    remove_html_alternative          boolean

=cut

sub msg_mutulate($@)
{   my ($msg, %args) = @_;

    my @rules;

    if($args{always_remove_plain_alternative})
    {   push @rules, \&_remove_plain_alternative;
    }
    elsif(my $threshold = $args{invalidate_plain_alternative})
    {   push @rules, _invalidate_plain($threshold/100);
    }

    if($args{add_plain_alternative_for_html})
    {   push @rules, \&_add_plain_alternative;
    }

    if($args{remove_html_alternative})
    {   push @rules, \&_remove_html_alternative;
    }

    push @rules, \&_remove_empty;

    $msg->rebuild
      ( keep_message_id => 1
      , extra_rules     => \@rules
      );
}

=pod

   my $code = _invalidate_plain $threshold

Produce a function which is called on each part to remove plain
alternatives to html which are useless.

The $threshold is a percentage.  When the number of characters in the
HTML is "percentage" larger or smaller than the number of characters in
the PLAIN, then the alternative multipart is replaced by a simple HTML.

=cut

sub _remove_empty($$)
{   my ($msg, $part) = @_;

    return undef if $part->body->decoded =~ /^\s*$/;
    return $part;    
}

sub _invalidate_plain($)
{   my $threshold = shift;

    sub {
        my ($msg, $part) = @_;

        # Already processed?  Processing is expensive
        return $part
            if $part->head->get('X-SL-Modified');

        $part->body->mimeType eq 'multipart/alternative'
            or return $part;

        my @parts = $part->body->parts;

        # Don't know how to handle more than 2 alternatives, yet
        @parts==2
            or return $part;

        my $mt1 = $parts[0]->body->mimeType;  # smart comparison object!
        my $mt2 = $parts[1]->body->mimeType;
        my ($plain_part, $html_part)
           = $mt1 eq 'text/plain' && $mt2 eq 'text/html'  ? @parts
           : $mt1 eq 'text/html'  && $mt2 eq 'text/plain' ? @parts[1,0]
           : return $part;

        my $plain_text = $plain_part->body->decoded->string;
        my $html_text  = $html_part->body->decoded->string;

        # cleanup plain, even white space is reduced
        $plain_text =~ s/\s+/ /g;

        # cleanup html
        $html_text  =~ s# \<\s*script\b .*? \<\s*\/\s*script\s*\> ##igxs;
        $html_text  =~ s# \<[^>]*\> ##gxs;
        $html_text  =~ s# \s+ # #gxs;
        if(length $html_text)
        {   my $relative = length($plain_text) / length($html_text);

            return $part  # keep
               if $relative >= 1-$threshold
               && $relative <= 1+$threshold;
        }

        info __x"invalidated plain alternative for html in {msgid}, t={thres}"
          , msgid => $msg->messageId, thres => $threshold;

        $html_part->head->add('X-SL-Modified' =>
           "invalidate_plain_alternative($threshold)");

        $html_part->container(undef);
        $html_part;
    };
}

sub _remove_plain_alternative($$)
{   my ($msg, $part) = @_;

    # Already processed?  Processing is expensive
    return $part
        if $part->head->get('X-SL-Modified');

    $part->body->mimeType eq 'multipart/alternative'
        or return $part;

    my @parts = $part->body->parts;

    # Don't know how to handle more than 2 alternatives, yet
    @parts==2
        or return $part;

    my $mt1 = $parts[0]->body->mimeType;  # smart comparison object!
    my $mt2 = $parts[1]->body->mimeType;
    my ($plain_part, $html_part)
       = $mt1 eq 'text/plain' && $mt2 eq 'text/html'  ? @parts
       : $mt1 eq 'text/html'  && $mt2 eq 'text/plain' ? @parts[1,0]
       : return $part;

    info __x"removed plain alternative for html in {msgid}"
      , msgid => $msg->messageId;

    $html_part->container(undef);
    $html_part;
}

sub _add_plain_alternative($$)
{   my ($msg, $html_part) = @_;
    my $body = $html_part->body;
    $body->mimeType eq 'text/html'
        or return $html_part;

    my $parent = $html_part->container;
    if($parent && $parent->mimeType eq 'multipart/alternative')
    {   # do we already have an plain alternative?
        $_->body->mimeType eq 'text/plain' && return $html_part
            for $parent->parts;
    }

    # XXXX The output from scrubber needs to be encoded to utf8,
    # otherwise it fails to be decoded in other code. Why is this?    
#   my $scrubber = HTML::Scrubber->new();
#   my $plain_text = encode_utf8($scrubber->scrub($body->decoded->string));
    my $plain_text = encode_utf8($body->decoded->string);

    my $plain_body = Mail::Message::Body::String->new
      ( based_on  => $body
      , mime_type => 'text/plain'
      , data      => $plain_text
      , charset   => 'utf8'
      , transfer_encoding => '8bit'
      );

    # The body's contain content info for new headers
    my $m = Mail::Message::Body::Multipart->new
      ( mime_type => 'multipart/alternative'
      , parts     => [ $plain_body, $html_part->body ]
      , preamble  => undef   # add some text?
      );

    info __x"added plain alternative for html in {msgid}"
      , msgid => $msg->messageId;

    # $html_part may be top-level msg. Keep the headers, except about content
    $html_part->body($m);
    $html_part;
}

sub _remove_html_alternative($$)
{   my ($msg, $part) = @_;
    $part->body->mimeType eq 'multipart/alternative'
        or return $part;

    my @parts = $part->parts;
    @parts==2 or return $part;

    my $mt1 = $parts[0]->body->mimeType;
    my $mt2 = $parts[1]->body->mimeType;

    my $take;
    if($mt1 eq 'text/plain')
    {   if($mt2 eq 'text/html') { $take = $parts[0] }
        elsif($mt2 eq 'multipart/related')
        {   my $subpart = $parts[1]->body->part(0);
            $take = $parts[0]
                if $subpart && $subpart->body->mimeType eq 'text/html';
        }
    }
    elsif($mt1 eq 'text/html'  && $mt2 eq 'text/plain')
    {   # The best alternative should always be at the end... however...
        $take = $parts[1];
    }

    defined $take
        or return $part;

    info __x"removed html alternative for plain in {msgid}"
      , msgid => $msg->messageId;

    $part->body($take->body);
    $part->head->add('X-SL-Modified' => 'remove_html_alternative');
    $part->container(undef);
    $part;
}

1;
