Skip to content

Commit 1dab4e0

Browse files
committed
ALTO renderer: use proper BlockTypes
- Use TextBlock, Illustration, or GraphicalElement (not just TextBlock), as appropriate for the internal block type.
- Do not enter RIL_TEXTLINE, RIL_WORD, RIL_SYMBOL, or ChoiceIterator on anything other than TextBlocks.
- Refactor the loop to make it more readable.
1 parent caff12e commit 1dab4e0

File tree

1 file changed

+95
-83
lines changed

1 file changed

+95
-83
lines changed

src/api/altorenderer.cpp

+95-83
Original file line number | Diff line number | Diff line change
@@ -164,97 +164,109 @@ char* TessBaseAPI::GetAltoText(ETEXT_DESC* monitor, int page_number) {
164164
<< " HEIGHT=\"" << rect_height_ << "\">\n";
165165

166166
ResultIterator* res_it = GetIterator();
167-
while (!res_it->Empty(RIL_BLOCK)) {
168-
if (res_it->Empty(RIL_WORD)) {
169-
res_it->Next(RIL_WORD);
170-
continue;
167+
for (; !res_it->Empty(RIL_BLOCK); res_it->Next(RIL_BLOCK)) {
168+
alto_str << "\t\t\t\t<ComposedBlock ID=\"cblock_" << bcnt << "\"";
169+
AddBoxToAlto(res_it, RIL_BLOCK, alto_str);
170+
alto_str << "\n";
171+
172+
const char* block_type;
173+
switch (res_it->BlockType()) {
174+
case PT_FLOWING_TEXT:
175+
case PT_HEADING_TEXT:
176+
case PT_PULLOUT_TEXT:
177+
case PT_CAPTION_TEXT:
178+
case PT_VERTICAL_TEXT:
179+
case PT_TABLE: // nothing special here
180+
case PT_EQUATION:
181+
case PT_INLINE_EQUATION:
182+
block_type = "TextBlock";
183+
break;
184+
case PT_FLOWING_IMAGE:
185+
case PT_HEADING_IMAGE:
186+
case PT_PULLOUT_IMAGE:
187+
block_type = "Illustration";
188+
break;
189+
case PT_HORZ_LINE:
190+
case PT_VERT_LINE:
191+
block_type = "GraphicalElement";
192+
break;
193+
default:
194+
block_type = "ComposedBlock";
171195
}
172196

173-
if (res_it->IsAtBeginningOf(RIL_BLOCK)) {
174-
alto_str << "\t\t\t\t<ComposedBlock ID=\"cblock_" << bcnt << "\"";
175-
AddBoxToAlto(res_it, RIL_BLOCK, alto_str);
176-
alto_str << "\n";
177-
}
178-
179-
if (res_it->IsAtBeginningOf(RIL_PARA)) {
180-
alto_str << "\t\t\t\t\t<TextBlock ID=\"block_" << tcnt << "\"";
197+
for (; !res_it->Empty(RIL_PARA); res_it->Next(RIL_PARA)) {
198+
alto_str << "\t\t\t\t\t<" << block_type << " ID=\"block_" << tcnt << "\"";
181199
AddBoxToAlto(res_it, RIL_PARA, alto_str);
182200
alto_str << "\n";
183-
}
184-
185-
if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {
186-
alto_str << "\t\t\t\t\t\t<TextLine ID=\"line_" << lcnt << "\"";
187-
AddBoxToAlto(res_it, RIL_TEXTLINE, alto_str);
188-
alto_str << "\n";
189-
}
190-
191-
alto_str << "\t\t\t\t\t\t\t<String ID=\"string_" << wcnt << "\"";
192-
AddBoxToAlto(res_it, RIL_WORD, alto_str);
193-
alto_str << " CONTENT=\"" << HOcrEscape(res_it->GetUTF8Text(RIL_WORD)).c_str() << "\">";
194-
195-
bool last_word_in_line = res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD);
196-
bool last_word_in_tblock = res_it->IsAtFinalElement(RIL_PARA, RIL_WORD);
197-
bool last_word_in_cblock = res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD);
198201

199-
200-
int left, top, right, bottom;
201-
res_it->BoundingBox(RIL_WORD, &left, &top, &right, &bottom);
202-
203-
do {
204-
alto_str << "\n\t\t\t\t\t\t\t\t<Glyph ID=\"glyph_" << scnt << "\"";
205-
AddBoxToAlto(res_it, RIL_SYMBOL, alto_str);
206-
alto_str << " CONTENT=\"";
207-
const std::unique_ptr<const char[]> grapheme(
208-
res_it->GetUTF8Text(RIL_SYMBOL));
209-
if (grapheme && grapheme[0] != 0) {
210-
alto_str << HOcrEscape(grapheme.get()).c_str();
202+
if (strcmp(block_type, "TextBlock") == 0) {
203+
for (; !res_it->Empty(RIL_TEXTLINE); res_it->Next(RIL_TEXTLINE)) {
204+
alto_str << "\t\t\t\t\t\t<TextLine ID=\"line_" << lcnt << "\"";
205+
AddBoxToAlto(res_it, RIL_TEXTLINE, alto_str);
206+
alto_str << "\n";
207+
208+
for (; !res_it->Empty(RIL_WORD); res_it->Next(RIL_WORD)) {
209+
int left = 0, top = 0, right = 0, bottom = 0;
210+
if (!res_it->IsAtBeginningOf(RIL_TEXTLINE)) {
211+
int hpos = right;
212+
int vpos = top;
213+
res_it->BoundingBox(RIL_WORD, &left, &top, &right, &bottom);
214+
int width = left - hpos;
215+
int height = bottom - top;
216+
alto_str << "\n\t\t\t\t\t\t\t<SP";
217+
alto_str << " HPOS=\"" << hpos << "\"";
218+
alto_str << " VPOS=\"" << vpos << "\"";
219+
alto_str << " WIDTH=\"" << width << "\"";
220+
alto_str << " HEIGHT=\"" << height << "\"/>\n";
221+
}
222+
res_it->BoundingBox(RIL_WORD, &left, &top, &right, &bottom);
223+
224+
alto_str << "\t\t\t\t\t\t\t<String ID=\"string_" << wcnt << "\"";
225+
AddBoxToAlto(res_it, RIL_WORD, alto_str);
226+
alto_str << " CONTENT=\"" << HOcrEscape(res_it->GetUTF8Text(RIL_WORD)).c_str() << "\">";
227+
228+
for (; !res_it->Empty(RIL_SYMBOL); res_it->Next(RIL_SYMBOL)) {
229+
alto_str << "\n\t\t\t\t\t\t\t\t<Glyph ID=\"glyph_" << scnt << "\"";
230+
AddBoxToAlto(res_it, RIL_SYMBOL, alto_str);
231+
alto_str << " CONTENT=\"";
232+
const std::unique_ptr<const char[]> grapheme(res_it->GetUTF8Text(RIL_SYMBOL));
233+
if (grapheme && grapheme[0] != 0)
234+
alto_str << HOcrEscape(grapheme.get()).c_str();
235+
alto_str << "\">";
236+
237+
ChoiceIterator choice_it(*res_it);
238+
do {
239+
int vc = choice_it.Confidence();
240+
alto_str << "\n\t\t\t\t\t\t\t\t\t<Variant VC=\"0." << vc << "\"";
241+
alto_str << " CONTENT=\"";
242+
const char* variant = choice_it.GetUTF8Text();
243+
if (variant && variant[0] != 0)
244+
alto_str << HOcrEscape(variant).c_str();
245+
alto_str << "\"/>";
246+
} while (choice_it.Next());
247+
alto_str << "\n\t\t\t\t\t\t\t\t</Glyph>";
248+
scnt++;
249+
if (res_it->IsAtFinalElement(RIL_WORD, RIL_SYMBOL))
250+
break;
251+
}
252+
alto_str << "\n\t\t\t\t\t\t\t</String>";
253+
wcnt++;
254+
if (res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD))
255+
break;
256+
}
257+
alto_str << "\n\t\t\t\t\t\t</TextLine>\n";
258+
lcnt++;
259+
if (res_it->IsAtFinalElement(RIL_PARA, RIL_TEXTLINE))
260+
break;
261+
}
211262
}
212-
alto_str << "\">";
213-
ChoiceIterator choice_it(*res_it);
214-
do {
215-
int vc = choice_it.Confidence();
216-
alto_str << "\n\t\t\t\t\t\t\t\t\t<Variant VC=\"0." << vc << "\"";
217-
alto_str << " CONTENT=\"";
218-
const char* variant = choice_it.GetUTF8Text();
219-
if (variant && variant[0] != 0)
220-
alto_str << HOcrEscape(variant).c_str();
221-
alto_str << "\"/>";
222-
} while (choice_it.Next());
223-
alto_str << "\n\t\t\t\t\t\t\t\t</Glyph>";
224-
res_it->Next(RIL_SYMBOL);
225-
226-
scnt++;
227-
} while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD));
228-
229-
alto_str << "\n\t\t\t\t\t\t\t</String>";
230-
231-
wcnt++;
232-
233-
if (last_word_in_line) {
234-
alto_str << "\n\t\t\t\t\t\t</TextLine>\n";
235-
lcnt++;
236-
} else {
237-
int hpos = right;
238-
int vpos = top;
239-
res_it->BoundingBox(RIL_WORD, &left, &top, &right, &bottom);
240-
int width = left - hpos;
241-
int height = bottom - top;
242-
alto_str << "\n\t\t\t\t\t\t\t<SP";
243-
alto_str << " HPOS=\"" << hpos << "\"";
244-
alto_str << " VPOS=\"" << vpos << "\"";
245-
alto_str << " WIDTH=\"" << width << "\"";
246-
alto_str << " HEIGHT=\"" << height << "\"/>\n";
247-
}
248-
249-
if (last_word_in_tblock) {
250-
alto_str << "\t\t\t\t\t</TextBlock>\n";
263+
alto_str << "\t\t\t\t\t</" << block_type << ">\n";
251264
tcnt++;
265+
if (res_it->IsAtFinalElement(RIL_BLOCK, RIL_PARA))
266+
break;
252267
}
253-
254-
if (last_word_in_cblock) {
255-
alto_str << "\t\t\t\t</ComposedBlock>\n";
256-
bcnt++;
257-
}
268+
alto_str << "\t\t\t\t</ComposedBlock>\n";
269+
bcnt++;
258270
}
259271

260272
alto_str << "\t\t\t</PrintSpace>\n"

0 commit comments

Comments
 (0)