@@ -164,97 +164,109 @@ char* TessBaseAPI::GetAltoText(ETEXT_DESC* monitor, int page_number) {
164
164
<< " HEIGHT=\" " << rect_height_ << " \" >\n " ;
165
165
166
166
ResultIterator* res_it = GetIterator ();
167
- while (!res_it->Empty (RIL_BLOCK)) {
168
- if (res_it->Empty (RIL_WORD)) {
169
- res_it->Next (RIL_WORD);
170
- continue ;
167
+ for (; !res_it->Empty (RIL_BLOCK); res_it->Next (RIL_BLOCK)) {
168
+ alto_str << " \t\t\t\t <ComposedBlock ID=\" cblock_" << bcnt << " \" " ;
169
+ AddBoxToAlto (res_it, RIL_BLOCK, alto_str);
170
+ alto_str << " \n " ;
171
+
172
+ const char * block_type;
173
+ switch (res_it->BlockType ()) {
174
+ case PT_FLOWING_TEXT:
175
+ case PT_HEADING_TEXT:
176
+ case PT_PULLOUT_TEXT:
177
+ case PT_CAPTION_TEXT:
178
+ case PT_VERTICAL_TEXT:
179
+ case PT_TABLE: // nothing special here
180
+ case PT_EQUATION:
181
+ case PT_INLINE_EQUATION:
182
+ block_type = " TextBlock" ;
183
+ break ;
184
+ case PT_FLOWING_IMAGE:
185
+ case PT_HEADING_IMAGE:
186
+ case PT_PULLOUT_IMAGE:
187
+ block_type = " Illustration" ;
188
+ break ;
189
+ case PT_HORZ_LINE:
190
+ case PT_VERT_LINE:
191
+ block_type = " GraphicalElement" ;
192
+ break ;
193
+ default :
194
+ block_type = " ComposedBlock" ;
171
195
}
172
196
173
- if (res_it->IsAtBeginningOf (RIL_BLOCK)) {
174
- alto_str << " \t\t\t\t <ComposedBlock ID=\" cblock_" << bcnt << " \" " ;
175
- AddBoxToAlto (res_it, RIL_BLOCK, alto_str);
176
- alto_str << " \n " ;
177
- }
178
-
179
- if (res_it->IsAtBeginningOf (RIL_PARA)) {
180
- alto_str << " \t\t\t\t\t <TextBlock ID=\" block_" << tcnt << " \" " ;
197
+ for (; !res_it->Empty (RIL_PARA); res_it->Next (RIL_PARA)) {
198
+ alto_str << " \t\t\t\t\t <" << block_type << " ID=\" block_" << tcnt << " \" " ;
181
199
AddBoxToAlto (res_it, RIL_PARA, alto_str);
182
200
alto_str << " \n " ;
183
- }
184
-
185
- if (res_it->IsAtBeginningOf (RIL_TEXTLINE)) {
186
- alto_str << " \t\t\t\t\t\t <TextLine ID=\" line_" << lcnt << " \" " ;
187
- AddBoxToAlto (res_it, RIL_TEXTLINE, alto_str);
188
- alto_str << " \n " ;
189
- }
190
-
191
- alto_str << " \t\t\t\t\t\t\t <String ID=\" string_" << wcnt << " \" " ;
192
- AddBoxToAlto (res_it, RIL_WORD, alto_str);
193
- alto_str << " CONTENT=\" " << HOcrEscape (res_it->GetUTF8Text (RIL_WORD)).c_str () << " \" >" ;
194
-
195
- bool last_word_in_line = res_it->IsAtFinalElement (RIL_TEXTLINE, RIL_WORD);
196
- bool last_word_in_tblock = res_it->IsAtFinalElement (RIL_PARA, RIL_WORD);
197
- bool last_word_in_cblock = res_it->IsAtFinalElement (RIL_BLOCK, RIL_WORD);
198
201
199
-
200
- int left, top, right, bottom;
201
- res_it->BoundingBox (RIL_WORD, &left, &top, &right, &bottom);
202
-
203
- do {
204
- alto_str << " \n\t\t\t\t\t\t\t\t <Glyph ID=\" glyph_" << scnt << " \" " ;
205
- AddBoxToAlto (res_it, RIL_SYMBOL, alto_str);
206
- alto_str << " CONTENT=\" " ;
207
- const std::unique_ptr<const char []> grapheme (
208
- res_it->GetUTF8Text (RIL_SYMBOL));
209
- if (grapheme && grapheme[0 ] != 0 ) {
210
- alto_str << HOcrEscape (grapheme.get ()).c_str ();
202
+ if (strcmp (block_type, " TextBlock" ) == 0 ) {
203
+ for (; !res_it->Empty (RIL_TEXTLINE); res_it->Next (RIL_TEXTLINE)) {
204
+ alto_str << " \t\t\t\t\t\t <TextLine ID=\" line_" << lcnt << " \" " ;
205
+ AddBoxToAlto (res_it, RIL_TEXTLINE, alto_str);
206
+ alto_str << " \n " ;
207
+
208
+ for (; !res_it->Empty (RIL_WORD); res_it->Next (RIL_WORD)) {
209
+ int left = 0 , top = 0 , right = 0 , bottom = 0 ;
210
+ if (!res_it->IsAtBeginningOf (RIL_TEXTLINE)) {
211
+ int hpos = right;
212
+ int vpos = top;
213
+ res_it->BoundingBox (RIL_WORD, &left, &top, &right, &bottom);
214
+ int width = left - hpos;
215
+ int height = bottom - top;
216
+ alto_str << " \n\t\t\t\t\t\t\t <SP" ;
217
+ alto_str << " HPOS=\" " << hpos << " \" " ;
218
+ alto_str << " VPOS=\" " << vpos << " \" " ;
219
+ alto_str << " WIDTH=\" " << width << " \" " ;
220
+ alto_str << " HEIGHT=\" " << height << " \" />\n " ;
221
+ }
222
+ res_it->BoundingBox (RIL_WORD, &left, &top, &right, &bottom);
223
+
224
+ alto_str << " \t\t\t\t\t\t\t <String ID=\" string_" << wcnt << " \" " ;
225
+ AddBoxToAlto (res_it, RIL_WORD, alto_str);
226
+ alto_str << " CONTENT=\" " << HOcrEscape (res_it->GetUTF8Text (RIL_WORD)).c_str () << " \" >" ;
227
+
228
+ for (; !res_it->Empty (RIL_SYMBOL); res_it->Next (RIL_SYMBOL)) {
229
+ alto_str << " \n\t\t\t\t\t\t\t\t <Glyph ID=\" glyph_" << scnt << " \" " ;
230
+ AddBoxToAlto (res_it, RIL_SYMBOL, alto_str);
231
+ alto_str << " CONTENT=\" " ;
232
+ const std::unique_ptr<const char []> grapheme (res_it->GetUTF8Text (RIL_SYMBOL));
233
+ if (grapheme && grapheme[0 ] != 0 )
234
+ alto_str << HOcrEscape (grapheme.get ()).c_str ();
235
+ alto_str << " \" >" ;
236
+
237
+ ChoiceIterator choice_it (*res_it);
238
+ do {
239
+ int vc = choice_it.Confidence ();
240
+ alto_str << " \n\t\t\t\t\t\t\t\t\t <Variant VC=\" 0." << vc << " \" " ;
241
+ alto_str << " CONTENT=\" " ;
242
+ const char * variant = choice_it.GetUTF8Text ();
243
+ if (variant && variant[0 ] != 0 )
244
+ alto_str << HOcrEscape (variant).c_str ();
245
+ alto_str << " \" />" ;
246
+ } while (choice_it.Next ());
247
+ alto_str << " \n\t\t\t\t\t\t\t\t </Glyph>" ;
248
+ scnt++;
249
+ if (res_it->IsAtFinalElement (RIL_WORD, RIL_SYMBOL))
250
+ break ;
251
+ }
252
+ alto_str << " \n\t\t\t\t\t\t\t </String>" ;
253
+ wcnt++;
254
+ if (res_it->IsAtFinalElement (RIL_TEXTLINE, RIL_WORD))
255
+ break ;
256
+ }
257
+ alto_str << " \n\t\t\t\t\t\t </TextLine>\n " ;
258
+ lcnt++;
259
+ if (res_it->IsAtFinalElement (RIL_PARA, RIL_TEXTLINE))
260
+ break ;
261
+ }
211
262
}
212
- alto_str << " \" >" ;
213
- ChoiceIterator choice_it (*res_it);
214
- do {
215
- int vc = choice_it.Confidence ();
216
- alto_str << " \n\t\t\t\t\t\t\t\t\t <Variant VC=\" 0." << vc << " \" " ;
217
- alto_str << " CONTENT=\" " ;
218
- const char * variant = choice_it.GetUTF8Text ();
219
- if (variant && variant[0 ] != 0 )
220
- alto_str << HOcrEscape (variant).c_str ();
221
- alto_str << " \" />" ;
222
- } while (choice_it.Next ());
223
- alto_str << " \n\t\t\t\t\t\t\t\t </Glyph>" ;
224
- res_it->Next (RIL_SYMBOL);
225
-
226
- scnt++;
227
- } while (!res_it->Empty (RIL_BLOCK) && !res_it->IsAtBeginningOf (RIL_WORD));
228
-
229
- alto_str << " \n\t\t\t\t\t\t\t </String>" ;
230
-
231
- wcnt++;
232
-
233
- if (last_word_in_line) {
234
- alto_str << " \n\t\t\t\t\t\t </TextLine>\n " ;
235
- lcnt++;
236
- } else {
237
- int hpos = right;
238
- int vpos = top;
239
- res_it->BoundingBox (RIL_WORD, &left, &top, &right, &bottom);
240
- int width = left - hpos;
241
- int height = bottom - top;
242
- alto_str << " \n\t\t\t\t\t\t\t <SP" ;
243
- alto_str << " HPOS=\" " << hpos << " \" " ;
244
- alto_str << " VPOS=\" " << vpos << " \" " ;
245
- alto_str << " WIDTH=\" " << width << " \" " ;
246
- alto_str << " HEIGHT=\" " << height << " \" />\n " ;
247
- }
248
-
249
- if (last_word_in_tblock) {
250
- alto_str << " \t\t\t\t\t </TextBlock>\n " ;
263
+ alto_str << " \t\t\t\t\t </" << block_type << " >\n " ;
251
264
tcnt++;
265
+ if (res_it->IsAtFinalElement (RIL_BLOCK, RIL_PARA))
266
+ break ;
252
267
}
253
-
254
- if (last_word_in_cblock) {
255
- alto_str << " \t\t\t\t </ComposedBlock>\n " ;
256
- bcnt++;
257
- }
268
+ alto_str << " \t\t\t\t </ComposedBlock>\n " ;
269
+ bcnt++;
258
270
}
259
271
260
272
alto_str << " \t\t\t </PrintSpace>\n "
0 commit comments