Changeset 78
- Timestamp: 04/27/08 22:17:53 (2 months ago)
- Files:
  - trunk/apps/pdfdiff/Perl/lib/PDF/Extract/Text.pm (modified) (4 diffs)
Legend:
- Unmodified
- Added
- Removed
- Modified
- Copied
- Moved
trunk/apps/pdfdiff/Perl/lib/PDF/Extract/Text.pm
r77 r78 36 36 37 37 use PDF::OCR::Thorough; 38 use Text::Autoformat ;38 use Text::Autoformat qw(autoformat break_wrap);; 39 39 use Text::Aspell; 40 40 … … 139 139 my $cutOffLen = $avgLen - 5; 140 140 for (my $ii = 0; $ii < @lines; $ii++) { 141 if ($lines->[$ii] =~ /\s{10,}/) { 141 my $curLen = length($lines->[$ii]); 142 if ($lines->[$ii] =~ /\s{10,}/ or 143 ($lines->[$ii] =~ /\s*\d+\.\s+/ and $curLen <= $cutOffLen)) { 142 144 # Assume that any line that starts with ten spaces or more is a 143 145 # title, heading or other stand alone unit of some sort. 146 147 $newText .= "\n\n" if ($newText !~ /\n\n$/s or $inPara); 148 144 149 $newText .= $lines->[$ii] . "\n"; 145 150 # Add another newline if one doesn't follow … … 152 157 if ($lines->[$ii] =~ /\-$/); 153 158 154 my $curLen = length($lines->[$ii]);159 $curLen = length($lines->[$ii]); # May have changed 155 160 if ($curLen <= $cutOffLen) { 156 161 $newText .= $lines->[$ii] . "\n"; 157 162 # Add another newline if one doesn't follow so the para is separated 158 163 $newText .= "\n" unless $lines->[$ii+1] =~ /^\s*$/; 164 $inPara = 0; 159 165 } else { 160 166 $newText .= $lines->[$ii] . 
" "; 167 $inPara = 1; 161 168 } 162 169 } 163 return autoformat $newText; 170 return autoformat($newText, {break=>break_wrap, all=>1, left=>0, right=>72}); 171 164 172 } 165 173 … … 167 175 my($self, $origFirstLine, $origSecondLine) = @_; 168 176 my ($firstLine, $secondLine) = ($origFirstLine, $origSecondLine); 169 if ($firstLine =~ s/^(.* )\s+(\S+)\-\s*$/$1/) {177 if ($firstLine =~ s/^(.*\s+[\[\(,]*)(\S+)\-\s*$/$1/) { 170 178 my $word = $2; 171 if ($secondLine =~ s/^\s*(\ S+)([\s\.\,]+.*)$/$2/) {179 if ($secondLine =~ s/^\s*(\w+)([\s\.\,\)\]]+)(.*)$/$3/) { 172 180 $word .= $1; 173 return ("$firstLine$word", $secondLine) 181 my $buffer = $2; 182 my $firstLineRebuild = "$firstLine$word"; 183 $firstLineRebuild .= $buffer unless ($buffer =~ /^\s*$/); 184 return ("$firstLineRebuild", $secondLine) 174 185 if ($self->{speller}->check($word)); 175 186 }