Changeset 73
- Timestamp: 04/27/08 22:17:39 (3 months ago)
- Files:
  - trunk/apps/pdfdiff/Perl/lib/PDF/Extract/Text.pm (modified) (3 diffs)
Legend:
- Unmodified
- Added
- Removed
- Modified
- Copied
- Moved
trunk/apps/pdfdiff/Perl/lib/PDF/Extract/Text.pm
r72 r73
116 116 my $text = $self->{ocrObj}->get_text;
117 117 my @lines = split(/\n/, $text);
118     my $line s= \@lines;
    118 my $line = \@lines;
119 119 my $newText;
120 120
 …   …
138 138 my $cutOffLen = $avgLen - 5;
139 139 for (my $ii = 0; $ii < @lines; $ii++) {
140     if ($line ->{$ii}=~ /\s{10,}/) {
    140 if ($lines->[$ii] =~ /\s{10,}/) {
141 141 # Assume that any line that starts with ten spaces or more is a
142 142 # title, heading or other stand alone unit of some sort.
143     $newText .= $line ->{$ii}. "\n";
    143 $newText .= $lines->[$ii] . "\n";
144 144 # Add another newline if one doesn't follow
145     $newText .= "\n" unless $line ->{$ii+1}=~ /^\s*$/;
    145 $newText .= "\n" unless $lines->[$ii+1] =~ /^\s*$/;
146 146 $inPara = 0;
147 147 next;
148 148 }
149     ($line ->{$ii},$line->{$ii+1}) =
150     $self->_handleDeHyphen($line ->{$ii},$line->{$ii+1})
151     if ($line ->{$ii}=~ /\-$/);
    149 ($lines->[$ii],$lines->[$ii+1]) =
    150 $self->_handleDeHyphen($lines->[$ii],$lines->[$ii+1])
    151 if ($lines->[$ii] =~ /\-$/);
152 152
153 153 if ($curLen <= $cutOffLen) {
154     $newText .= $line ->{$ii}. "\n";
    154 $newText .= $lines->[$ii] . "\n";
155 155 # Add another newline if one doesn't follow so the para is separated
156     $newText .= "\n" unless $line ->{$ii+1}=~ /^\s*$/;
    156 $newText .= "\n" unless $lines->[$ii+1] =~ /^\s*$/;
157 157 } else {
158     $newText .= $line ->{$ii}. " ";
    158 $newText .= $lines->[$ii] . " ";
159 159 }
160 160 }
 …   …
165 165 my($self, $origFirstLine, $origSecondLine) = @_;
166 166 my ($firstLine, $secondLine) = ($origFirstLine, $origSecondLine);
167     if ($firstLine =~ /^(.*)\s+(\S+)\-\s*$/$1/) {
    167 if ($firstLine =~ s/^(.*)\s+(\S+)\-\s*$/$1/) {
168 168 my $word = $2;
169     if ($secondLine =~ /^\s*(\S+)([\s\.\,]+.*)$/$2/) {
    169 if ($secondLine =~ s/^\s*(\S+)([\s\.\,]+.*)$/$2/) {
170 170 $word .= $1;
171 171 return ("$firstLine$word", $secondLine)