| 108 | | return $self->{ocrObj}->get_text; |
|---|
| | 116 | my $text = $self->{ocrObj}->get_text; |
|---|
| | 117 | my @lines = split(/\n/, $text); |
|---|
| | 118 | my $lines = \@lines; |
|---|
| | 119 | my $newText; |
|---|
| | 120 | |
|---|
| | 121 | # First, find out the average length of a line. |
|---|
| | 122 | my $count = 0; |
|---|
| | 123 | my $totLen = 0; |
|---|
| | 124 | for (my $ii = 0; $ii < @lines; $ii++) { |
|---|
| | 125 | $lines->[$ii] =~ s/^\s*//; $lines->[$ii] =~ s/\s*$//; |
|---|
| | 126 | if ($lines->[$ii] !~ /^\s*$/) { |
|---|
| | 127 | $count++; |
|---|
| | 128 | $totLen += length($lines->[$ii]); |
|---|
| | 129 | } |
|---|
| | 130 | } |
|---|
| | 131 | my $avgLen = $totLen / $count; |
|---|
| | 132 | |
|---|
| | 133 | # Now, the loop that: |
|---|
| | 134 | # (a) tries to find paragraphs |
|---|
| | 135 | # (b) attempts to un-hyphenate words |
|---|
| | 136 | |
|---|
| | 137 | my $inPara = 0; |
|---|
| | 138 | my $cutOffLen = $avgLen - 5; |
|---|
| | 139 | for (my $ii = 0; $ii < @lines; $ii++) { |
|---|
| | 140 | if ($line->{$ii} =~ /\s{10,}/) { |
|---|
| | 141 | # Assume that any line that starts with ten spaces or more is a |
|---|
| | 142 | # title, heading or other stand alone unit of some sort. |
|---|
| | 143 | $newText .= $line->{$ii} . "\n"; |
|---|
| | 144 | # Add another newline if one doesn't follow |
|---|
| | 145 | $newText .= "\n" unless $line->{$ii+1} =~ /^\s*$/; |
|---|
| | 146 | $inPara = 0; |
|---|
| | 147 | next; |
|---|
| | 148 | } |
|---|
| | 149 | ($line->{$ii},$line->{$ii+1}) = |
|---|
| | 150 | $self->_handleDeHyphen($line->{$ii},$line->{$ii+1}) |
|---|
| | 151 | if ($line->{$ii} =~ /\-$/); |
|---|
| | 152 | |
|---|
| | 153 | if ($curLen <= $cutOffLen) { |
|---|
| | 154 | $newText .= $line->{$ii} . "\n"; |
|---|
| | 155 | # Add another newline if one doesn't follow so the para is separated |
|---|
| | 156 | $newText .= "\n" unless $line->{$ii+1} =~ /^\s*$/; |
|---|
| | 157 | } else { |
|---|
| | 158 | $newText .= $line->{$ii} . " "; |
|---|
| | 159 | } |
|---|
| | 160 | } |
|---|
| | 161 | return autoformat $newText; |
|---|
# _handleDeHyphen($firstLine, $secondLine)
#
# Tries to rejoin a word that was split by a hyphen across a line break.
#
# Parameters:
#   $origFirstLine  - a line that ends in a hyphenated word fragment
#   $origSecondLine - the line that follows it (may be undef)
#
# Returns a two-element list (first line, second line):
#   * If the last fragment of the first line plus the first whitespace-
#     delimited token of the second line is accepted by
#     $self->{speller}->check(), the fragment is completed on the first
#     line and the consumed token is removed from the second line.
#   * Otherwise both lines are returned exactly as they were passed in.
#
# $self->{speller} must provide a check($word) method whose return value is
# true for an acceptable word.
sub _handleDeHyphen {
    my ($self, $origFirstLine, $origSecondLine) = @_;

    # Nothing to join with, e.g. the hyphenated line is the last line of
    # the text: hand the input back untouched instead of matching on undef.
    return ($origFirstLine, $origSecondLine)
        unless defined $origFirstLine && defined $origSecondLine;

    # Work on copies so the originals can be returned if the join is rejected.
    my ($firstLine, $secondLine) = ($origFirstLine, $origSecondLine);

    # Strip the trailing "fragment-" from the first line, keeping the text
    # before it in $firstLine and the fragment itself in $2.
    if ($firstLine =~ s/^(.*)\s+(\S+)\-\s*$/$1/) {
        my $word = $2;

        # Strip the leading token from the second line; what remains ($2)
        # starts with the whitespace/punctuation that followed the token.
        if ($secondLine =~ s/^\s*(\S+)([\s\.\,]+.*)$/$2/) {
            $word .= $1;

            # The whitespace that separated the fragment from the preceding
            # text was consumed by the first substitution, so put a space
            # back before appending the rejoined word.
            return ("$firstLine $word", $secondLine)
                if $self->{speller}->check($word);
        }
    }

    # Not a recognisable hyphenation, or the speller rejected the joined word.
    return ($origFirstLine, $origSecondLine);
}
|---|