Software Freedom Law Center

Changeset 78

Show
Ignore:
Timestamp:
04/27/08 22:17:53 (2 months ago)
Author:
bkuhn
Message:

r93@hughes: bkuhn | 2008-04-27 22:17:14 -0400

  • Various improvements to handle autoformat of paragraphs and skipping
    various parts.
Files:

Legend:

Unmodified
Added
Removed
Modified
Copied
Moved
  • trunk/apps/pdfdiff/Perl/lib/PDF/Extract/Text.pm

    r77 r78
     36  36
     37  37 use PDF::OCR::Thorough;
     38     use Text::Autoformat
         38 use Text::Autoformat  qw(autoformat break_wrap);
     39  39 use Text::Aspell;
     40  40
     
    139 139   my $cutOffLen = $avgLen - 5;
    140 140   for (my $ii = 0; $ii < @lines; $ii++) {
    141         if ($lines->[$ii] =~ /\s{10,}/) {
        141     my $curLen = length($lines->[$ii]);
        142     if ($lines->[$ii] =~ /\s{10,}/ or
        143         ($lines->[$ii] =~ /\s*\d+\.\s+/ and $curLen <= $cutOffLen)) {
    142 144       # Assume that any line that starts with ten spaces or more is a
    143 145       # title, heading or other stand alone unit of some sort.
        146
        147       $newText .= "\n\n" if ($newText !~ /\n\n$/s or $inPara);
        148
    144 149       $newText .= $lines->[$ii] . "\n";
    145 150       # Add another newline if one doesn't follow
     
    152 157         if ($lines->[$ii] =~ /\-$/);
    153 158
    154         my $curLen = length($lines->[$ii]);
        159     $curLen = length($lines->[$ii]);  # May have changed
    155 160     if ($curLen <= $cutOffLen) {
    156 161       $newText .= $lines->[$ii] . "\n";
    157 162       # Add another newline if one doesn't follow so the para is separated
    158 163       $newText .= "\n" unless $lines->[$ii+1] =~ /^\s*$/;
        164       $inPara = 0;
    159 165     } else {
    160 166       $newText .= $lines->[$ii] . " ";
        167       $inPara = 1;
    161 168     }
    162 169   }
    163       return autoformat $newText;
        170   return autoformat($newText, {break=>break_wrap, all=>1, left=>0, right=>72});
        171
    164 172 }
    165 173
     
    167 175   my($self, $origFirstLine, $origSecondLine) = @_;
    168 176   my ($firstLine, $secondLine) = ($origFirstLine, $origSecondLine);
    169       if ($firstLine =~ s/^(.*)\s+(\S+)\-\s*$/$1/) {
        177   if ($firstLine =~ s/^(.*\s+[\[\(,]*)(\S+)\-\s*$/$1/) {
    170 178     my $word = $2;
    171         if ($secondLine =~ s/^\s*(\S+)([\s\.\,]+.*)$/$2/) {
        179     if ($secondLine =~ s/^\s*(\w+)([\s\.\,\)\]]+)(.*)$/$3/) {
    172 180       $word .= $1;
    173           return ("$firstLine$word", $secondLine)
        181       my $buffer = $2;
        182       my $firstLineRebuild = "$firstLine$word";
        183       $firstLineRebuild .= $buffer unless ($buffer =~ /^\s*$/);
        184       return ("$firstLineRebuild", $secondLine)
    174 185         if ($self->{speller}->check($word));
    175 186     }

SFLC Main Page

[frdm] Support SFLC