Language, Character Set and Code Page Handling Module. More...

Classes
struct	LANGUAGE_INFO
	Language info. More...

Macros
#define	UNICODE_REJECTED 0x0fffd
	The rejected symbol in Unicode strings. See kRecSetRejectionSymbol and kRecGetRejectionSymbol.

#define	UNICODE_MISSING 0x0fffc
	The missing symbol in Unicode strings. See kRecSetMissingSymbol and kRecGetMissingSymbol.

#define	MAXCPNAMELEN 32
	Maximal buffer length needed for Code Page name.

Typedefs
typedef enum LANG_ENA	LANG_ENA
	Language enable/disable.

typedef enum OUTCODEPAGETYPE	OUTCODEPAGETYPE
	Code page types.

typedef OUTCODEPAGETYPE *	LPOUTCODEPAGETYPE
	Pointer to an OUTCODEPAGETYPE.

Enumerations
enum	CHR_FILTER { FILTER_DEFAULT = 0 , FILTER_DIGIT = 1 , FILTER_UPPERCASE = 2 , FILTER_LOWERCASE = 4 , FILTER_PUNCTUATION = 8 , FILTER_MISCELLANEOUS = 16 , FILTER_PLUS = 32 , FILTER_PLUS_1 = 128 , FILTER_PLUS_2 = 256 , FILTER_PLUS_3 = 512 , FILTER_USER_DICT = 64 , FILTER_WESTERN = 1024 , FILTER_ALL , FILTER_ALPHA = (FILTER_UPPERCASE | FILTER_LOWERCASE) , FILTER_NUMBERS = (FILTER_DIGIT | FILTER_PLUS) , FILTER_SIZE = 2048 }
	Recognition filters. More...

enum	LANG_ENA { LANG_DISABLED = 0 , LANG_ENABLED }
	Language enable/disable. More...

enum	MANAGE_LANG { SET_LANG = 0 , ADD_LANG , REMOVE_LANG , INVERT_LANG , IS_LANG_ENABLED }
	Language management actions. More...

enum	LANGUAGES { LANG_ALL = -1024 , LANG_ALL_LATIN = -1023 , LANG_ALL_ASIAN = -1022 , LANG_START = -3 , LANG_UD = -3 , LANG_AUTO = -2 , LANG_NO = -1 , LANG_ENG = 0 , LANG_GER , LANG_FRE , LANG_DUT , LANG_NOR , LANG_SWE , LANG_FIN , LANG_DAN , LANG_ICE , LANG_POR , LANG_SPA , LANG_CAT , LANG_GAL , LANG_ITA , LANG_MAL , LANG_GRE , LANG_POL , LANG_CZH , LANG_SLK , LANG_HUN , LANG_SLN , LANG_CRO , LANG_ROM , LANG_ALB , LANG_TUR , LANG_EST , LANG_LAT , LANG_LIT , LANG_ESP , LANG_SRL , LANG_SRB , LANG_MAC , LANG_MOL , LANG_BUL , LANG_BEL , LANG_UKR , LANG_RUS , LANG_CHE , LANG_KAB , LANG_AFR , LANG_AYM , LANG_BAS , LANG_BEM , LANG_BLA , LANG_BRE , LANG_BRA , LANG_BUG , LANG_CHA , LANG_CHU , LANG_COR , LANG_CRW , LANG_ESK , LANG_FAR , LANG_FIJ , LANG_FRI , LANG_FRU , LANG_GLI , LANG_GLS , LANG_GAN , LANG_GUA , LANG_HAN , LANG_HAW , LANG_IDO , LANG_IND , LANG_INT , LANG_KAS , LANG_KAW , LANG_KIK , LANG_KON , LANG_KPE , LANG_KUR , LANG_LTN , LANG_LUB , LANG_LUX , LANG_MLG , LANG_MLY , LANG_MLN , LANG_MAO , LANG_MAY , LANG_MIA , LANG_MIN , LANG_MOH , LANG_NAH , LANG_NYA , LANG_OCC , LANG_OJI , LANG_PAP , LANG_PID , LANG_PRO , LANG_QUE , LANG_RHA , LANG_ROY , LANG_RUA , LANG_RUN , LANG_SAM , LANG_SAR , LANG_SHO , LANG_SIO , LANG_SMI , LANG_SML , LANG_SMN , LANG_SMS , LANG_SOM , LANG_SOT , LANG_SUN , LANG_SWA , LANG_SWZ , LANG_TAG , LANG_TAH , LANG_TIN , LANG_TON , LANG_TUN , LANG_VIS , LANG_WEL , LANG_WEN , LANG_WOL , LANG_XHO , LANG_ZAP , LANG_ZUL , LANG_JPN , LANG_CHS , LANG_CHT , LANG_KRN , LANG_THA , LANG_ARA , LANG_HEB , LANG_VIE , LANG_SIZE }
	Possible languages. More...

enum	CONTINENT { C_EUROPE = 0x0001 , C_ASIA = 0x0002 , C_AFRICA = 0x0004 , C_OCEANIA = 0x0008 , C_LAMERICA = 0x0010 , C_NAMERICA = 0x0020 , C_INTERNATIONAL = 0x0040 , C_EURASIA = C_EUROPE | C_ASIA , C_EURNAM = C_EUROPE | C_NAMERICA , C_EURLAM = C_EUROPE | C_LAMERICA , C_EURNAMOCE = C_EUROPE | C_NAMERICA | C_OCEANIA , C_WORLD = C_EUROPE | C_ASIA | C_AFRICA | C_OCEANIA | C_LAMERICA | C_NAMERICA | C_INTERNATIONAL }
	Continent ID. More...

enum	BASIC_LANGUAGE_CHARSET { B_OTH = 0 , B_BAS = 1 , B_LAT = 2 , B_GRE = 4 , B_CYR = 8 , B_ASN = 16 , B_RTL = 32 }
	Character set bases. More...

enum	RM_FLAGS
	Recognition Engines supporting a language. More...

enum	LANGUAGE_CODE { LANGCODE_ALL = 0 , LANGCODE_ENGLISH , LANGCODE_INT_3 , LANGCODE_639_1 , LANGCODE_639_2B , LANGCODE_639_3 , LANGCODE_WIN_3 , LANGCODE_BCP_47 }
	Language code type. More...

enum	OUTCODEPAGETYPE { CODEP_UNKNOWN = 0 , SPECIFIC , ASCII_BASED , ANSI_BASED , MAC_BASED , INTERNAL_CP , ASIAN_CODEPAGE }
	Code page types. More...

Functions
RECERR RECAPIKRN	kRecSetLanguages (int sid, const LANG_ENA *pLanguages)
	Setting languages.

RECERR RECAPIKRN	kRecGetLanguages (int sid, LANG_ENA *pLanguagesOut)
	Getting languages.

RECERR RECAPIKRN	kRecManageLanguages (int sid, MANAGE_LANG action, LANGUAGES language)
	Managing enabled languages.

RECERR RECAPIKRN	kRecSetSingleLanguageDetection (int sid, INTBOOL bEnable)
	Automatic Single Language Detection.

RECERR RECAPIKRN	kRecGetSingleLanguageDetection (int sid, INTBOOL *pbEnable)
	Getting the single language detection flag.

RECERR RECAPIKRN	kRecGetPageLanguages (HPAGE hPage, LANG_ENA *pOcrLanguagesOut)
	Getting languages of the page.

RECERR RECAPIKRN	kRecGetLanguageInfo (LANGUAGES lang, LANGUAGE_INFO *pInfo)
	Getting information about a language.

RECERR RECAPIKRN	kRecFindLanguages (const LANGUAGE_INFO *pInfo, LANG_ENA *pLanguagesOut)
	Searching for languages.

RECERR RECAPIKRN	kRecFindLanguage (LPCTSTR pLangName, LANGUAGES *pLanguage)
	Searching for languages.

RECERR RECAPIKRN	kRecFindLanguageEx (LANGUAGE_CODE coding, LPCTSTR pLangName, LANGUAGES *pLanguage, LANG_ENA *pLanguagesOut)
	Searching for languages.

RECERR RECAPIKRN	kRecSetLanguagesPlus (int sid, LPCWSTR pOcrLplus)
	Setting LanguagesPlus characters.

RECERR RECAPIKRN	kRecGetLanguagesPlus (int sid, LPWSTR pOcrLplus, size_t iBSize)
	Getting LanguagesPlus characters.

RECERR RECAPIKRN	kRecSetDefaultFilter (int sid, CHR_FILTER Glfilter)
	Changing global character set filter.

RECERR RECAPIKRN	kRecGetDefaultFilter (int sid, CHR_FILTER *pGlfilter)
	Getting the global character set filter.

RECERR RECAPIKRN	kRecSetFilterPlus (int sid, LPCWSTR pFilterPlus)
	Setting FilterPlus characters.

RECERR RECAPIKRN	kRecGetFilterPlus (int sid, LPWSTR pFilterPlus, size_t iSize)
	Getting FilterPlus characters.

RECERR RECAPIKRN	kRecSetFilterPlusEx (int sid, int index, LPCTSTR pFilterPlus)
	Setting FilterPlus characters, extended.

RECERR RECAPIKRN	kRecSetRejectionSymbol (int sid, WCHAR wRej)
	Setting the rejection symbol character.

RECERR RECAPIKRN	kRecGetRejectionSymbol (int sid, LPWCH pwRej)
	Getting the rejection symbol character.

RECERR RECAPIKRN	kRecSetMissingSymbol (int sid, WCHAR wMiss)
	Setting the missing symbol character.

RECERR RECAPIKRN	kRecGetMissingSymbol (int sid, LPWCH pwMiss)
	Getting the missing symbol character.

RECERR RECAPIKRN	kRecSetCodePage (int sid, LPCTSTR pCodePageName)
	Setting the code page.

RECERR RECAPIKRN	kRecGetCodePage (int sid, LPTSTR pCodePageName, size_t buflen)
	Getting the code page name.

RECERR RECAPIKRN	kRecGetCodePageInfo (LPCTSTR pCodePageName, LPTSTR pDesc, size_t size, LPOUTCODEPAGETYPE pCodePageType)
	Getting information about the code page.

RECERR RECAPIKRN	kRecCheckCodePage (int sid, LPWSTR pMissingChrs, size_t buflen)
	Checking the code page.

RECERR RECAPIKRN	kRecGetFirstCodePage (LPTSTR pCodePageName, size_t buflen)
	Starting enumeration of code pages.

RECERR RECAPIKRN	kRecGetNextCodePage (LPTSTR pCodePageName, size_t buflen)
	Performing enumeration of code pages.

RECERR RECAPIKRN	kRecConvertCodePage2Unicode (int sid, const LPBYTE pInput, size_t *pInputLen, LPWCH pUniCode)
	Converting from the current code page to 16-bit UNICODE.

RECERR RECAPIKRN	kRecConvertUnicode2CodePage (int sid, WCHAR UniCode, LPBYTE pOutput, size_t *pOutputLen)
	Converting from 16-bit UNICODE to the current code page.

RECERR RECAPIKRN	kRecConvertCodePage2UnicodeEx (int sid, const char *pInput, size_t *pInputLen, WCHAR **ppOutput, size_t *pOutputCount, int flags)
	Converting a character or a string from a code page to 16-bit UNICODE.

RECERR RECAPIKRN	kRecConvertUnicode2CodePageEx (int sid, const WCHAR *pInput, size_t *pInputCount, char **ppOutput, size_t *pOutputLen, int flags)
	Converting a 16-bit UNICODE character or string to a code page.

Codepage Conversion Mode flags for extended code page conversion
These flags describe how the extended code page conversion functions kRecConvertCodePage2UnicodeEx and kRecConvertUnicode2CodePageEx work. One of the CCM_CP_* flags is to be combined with one of the CCM_CONV_* flags. There is one pre-combined flag for the most frequently used case (CCM_UTF8_STRING).
#define	CCM_CP_CURRENT 0x01
	Use the current code page for conversion.

#define	CCM_CP_UTF8 0x02
	Use the UTF-8 code page for conversion.

#define	CCM_CP_ANSI 0x03
	Use the ANSI code page (CP 1252) for conversion.

#define	CCM_CONV_CHAR 0x10
	Convert a single character only.

#define	CCM_CONV_STRING 0x20
	Convert the full zero terminated string.

#define	CCM_UTF8_STRING (CCM_CP_UTF8 | CCM_CONV_STRING)
	Combined flag for UTF-8 strings.

Detailed Description

Language, Character Set and Code Page Handling Module.

This module handles language, character set and code page related settings and their combinations.

The processing language must be specified before calling any processing function on a page. You may define one or more languages with the kRecSetLanguages or kRecManageLanguages functions. The languages specify both the set of characters to recognize and - if spell checking is enabled - the dictionaries to use. If more than one language is enabled automatic language detection is done. Automatic detection has two working modes:

The default mode is suitable for a few (at most about 5) languages and is designed to work on pages which contain text in all of the enabled languages.
The Single Language Detection mode (enabled with the kRecSetSingleLanguageDetection function) is designed for unattended scanning or document input scenarios, where the language of arriving documents cannot be predicted.

CCJK and Arabic languages can be recognized one language at a time only (but English characters are automatically enabled), so only the second, Single Language Detection mode is supported when more than one CCJK language and/or the Arabic one is enabled.

NOTE: Single Language Detection of the Vietnamese, Thai and Hebrew languages is not supported. Very clean documents in Greek, Russian and other Cyrillic languages can be processed with Single Language Detection, but making your application depend on automatic detection of these languages is not encouraged.

The structure LANGUAGE_INFO provides information about a selected language and particular abbreviations of its name. This module supports the CSDK internal language codes and the following language code standards: ISO/DIS 639-1, ISO/DIS 639-2/B, ISO/DIS 639-3, BCP 47 and Windows 3-letter language codes. See the list of language identifiers for details.

In the ISO 639-3 standard there are languages missing. CSDK defines additional local identifiers for them as follows:

qsl	Serbian (Latin)
qbp	Brazilian
qes	Eskimo
qti	Pirez
qcs	Chinese (Simplified)
qct	Chinese (Traditional)

CSDK extends the ISO 639-3 standard with the following codes coming from ISO 639-2/B:

SMI	Sami languages

See the list of language identifiers for even more details about the supported standards and language identifiers.

Code pages

The current code page is specified by the setting Kernel.Chr.CodePage. Its default value is -1 meaning "Auto". Auto code page means the current code page comes from the setting Kernel.Chr.CodePage.Default. The default value of this latter setting on Windows is the code page of the current OS, on Linux and MacOS it is UTF-8. The kRecGetCodePage function can be used to learn this default value.

Current Windows OSs already support UTF-8 very well, so we suggest using UTF-8 for all languages on all platforms. Use:

kRecSetCodePage(sid, "UTF-8")

See also: Settings of the Character and Code Page Handling Module; Language, Character Set and Code page Handling Module related pages

Typedef Documentation

◆ LANG_ENA

typedef enum LANG_ENA LANG_ENA

Language enable/disable.

This defines the possible values for the language selection in the Language environment definition. This is supplied by the enum LANGUAGES and used by the function kRecSetLanguages.

Note: The Character Set is primarily defined by the Language environment specified by kRecSetLanguages. If no language is enabled, only the digits, language-independent punctuation and the miscellaneous characters are validated. Whenever at least one language is enabled (i.e. gets the LANG_ENABLED value), the unaccented lower and uppercase Latin-alphabet characters are validated in addition to the accented lower and uppercase letters defined for the enabled language(s). Language-specific punctuation is enabled with the language concerned. At present these only include the inverted question mark and inverted exclamation mark for Spanish. To change the language selection, create a LANG_SIZE sized array with this LANG_ENA type. Initialize all elements to LANG_DISABLED, enable the required languages and pass the array to the kRecSetLanguages function. To enable a single language only, use the kRecManageLanguages function. That function is useful in other cases as well; see its description.

◆ OUTCODEPAGETYPE

typedef enum OUTCODEPAGETYPE OUTCODEPAGETYPE

Code page types.

Each output code page is classified into one of these categories.

Enumeration Type Documentation

◆ BASIC_LANGUAGE_CHARSET

enum BASIC_LANGUAGE_CHARSET

Character set bases.

Basic character set types of languages. See LANGUAGE_INFO.

Enumerator
B_OTH	Internal use only.
B_BAS	Internal use only.
B_LAT	Latin alphabet-based language.
B_GRE	Greek language.
B_CYR	Cyrillic alphabet-based language.
B_ASN	An Asian language.
B_RTL	Right-to-left language (Arabic, Hebrew, ...).

◆ CHR_FILTER

enum CHR_FILTER

Recognition filters.

This enum lists available Character Set filter elements. Language environment can be narrowed down by specifying Character Set filters. The name of each filter element denotes the category of characters it validates. A filter is built from one or more filter elements by combining (binary OR-ing) them. There are five disjunct elements, six special ones and some pre-defined, combined ones. The union of the five disjunct elements covers the full set of supported characters. The special filter elements are user defined sets of characters. The filters can have an effect either at zone level (by specifying the zone's filter field) or globally, at page level (defined by the kRecSetDefaultFilter function). Use the FILTER_ALL value to set no filtering.

Note

Filters are not supported when an Eastern language (CCJK, Thai, Vietnamese, Arabic or Hebrew) is the current one. For those languages FILTER_ALL is the only supported filter. Nevertheless, the FILTER_WESTERN bit can be specified in some of the zones. It means that the given zone contains Western characters only, so that zone will be recognized by the Western OCR engine and Character Set filtering bits are supported in that zone.

Characters in the document or zone that are not part of the specified Character Set will either be UNICODE_REJECTED or recognized as a validated character with a similar shape. For instance, if you select English only and the document contains a letter "Capital A with acute", the recognized output may be the letter "Capital A".

Filters have no effect on the 2nd, 3rd etc. choices of each LETTER.

The capabilities of the selected recognition module can also impose restrictions, e.g. the HNR module is restricted to numerals and four other characters.

Not all recognition modules support all filter elements:

The ones that support all filter elements are: RM_OMNIFONT_MOR, RM_RER, RM_DOT, RM_OMNIFONT_FRX.
The ones that ignore all filter elements are: RM_BAR, RM_OMR.
The ones that support some filter elements are: RM_OMNIFONT_PLUS2W, RM_OMNIFONT_PLUS3W and RM_OMNIFONT_MTX (FILTER_ALL, FILTER_DIGIT and FILTER_ALPHA), RM_HNR (FILTER_ALL, FILTER_DIGIT, FILTER_PUNCTUATION and FILTER_MISCELLANEOUS).
While many recognition modules support some filters, machine print engines usually do not require them. Filters are best used for improving accuracy of the handprint recognition modules RM_RER and RM_HNR.

To add FILTER_PLUS characters to the Character Set defined by the language environment, the filter value should be: FILTER_ALL | FILTER_PLUS.

To add FILTER_PLUS characters to the filtered Character Set, place FILTER_PLUS along with the other required filters. For example, to enable only digits and FILTER_PLUS characters, use: FILTER_DIGIT | FILTER_PLUS.

To validate FILTER_PLUS characters only, FILTER_PLUS must be the only filter element in the zone structure field. This even prevents language selection from validating letters in the current zone.

There are four different sets of FILTER_PLUS characters; the above comments apply to all of them. Use kRecSetFilterPlusEx function with the correct index to specify the set of characters to use with them.

For example a possible use case could be that there are three different zone types: alpha only, numeric and mixed. Use FILTER_ALPHA for the alpha only fields, FILTER_DIGIT | FILTER_PLUS for the numeric fields where the 0-indexed FilterPlus string may be specified as ".,-/", and use FILTER_ALPHA | FILTER_DIGIT | FILTER_PLUS_1 for the mixed fields where the 1-indexed FilterPlus string may be specified as ".,-/#$%()". (Of course the sets should be specified according to your needs.)

Enumerator
FILTER_DEFAULT	Use this value to have the zone handled globally. Do not combine this with any other filters.
FILTER_DIGIT	[Disjunct filter] Recognition of numerals only. E.g.: "3" (Digit Three).
FILTER_UPPERCASE	[Disjunct filter] Recognition of uppercase letters only, including accented ones. E.g.: "A" (Capital A).
FILTER_LOWERCASE	[Disjunct filter] Recognition of lowercase letters only, including accented ones. E.g.: "a" (Lowercase a).
FILTER_PUNCTUATION	[Disjunct filter] Recognition of punctuation signs only. E.g.: "!" (Exclamation Mark).
FILTER_MISCELLANEOUS	[Disjunct filter] Recognition of other miscellaneous characters only. E.g.: "+" (Plus sign).
FILTER_PLUS	[Special, combinable filter] Enables the use of the first set of FilterPlus characters specified by the kRecSetFilterPlus or kRecSetFilterPlusEx functions (index 0). The FilterPlus characters are added after any kind of filtering.
FILTER_PLUS_1	[Special, combinable filter] Enables the use of the second set of FilterPlus characters (index 1).
FILTER_PLUS_2	[Special, combinable filter] Enables the use of the third set of FilterPlus characters (index 2).
FILTER_PLUS_3	[Special, combinable filter] Enables the use of the fourth set of FilterPlus characters (index 3).
FILTER_USER_DICT	[Special, combinable filter] Recognition of characters from the user dictionary.
FILTER_WESTERN	[Special, combinable filter] Enables the above Western filtering rules to work in the zone when the current language is an Asian one. (No Asian characters are to be recognized in such zones.)
FILTER_ALL	[Pre-defined combined filter] Since all elements are enabled, there is no filtering.
FILTER_ALPHA	[Pre-defined combined filter] Recognition of upper and lowercase letters only.
FILTER_NUMBERS	[Pre-defined combined filter] Recognition of digits and the FilterPlus characters set by the kRecSetFilterPlus function.
FILTER_SIZE	Number of possible combinations of the disjunct filters.

◆ CONTINENT

enum CONTINENT

Continent ID.

This enum can be used for identifying the geographical location, where a given language is spoken. See LANGUAGE_INFO.

Enumerator
C_EUROPE	Europe
C_ASIA	Asia
C_AFRICA	Africa
C_OCEANIA	Australia and Oceania
C_LAMERICA	Latin America
C_NAMERICA	North America
C_INTERNATIONAL	Artificial languages and Latin

◆ LANG_ENA

enum LANG_ENA

Language enable/disable.

This defines the possible values for the language selection in the Language environment definition. This is supplied by the enum LANGUAGES and used by the function kRecSetLanguages.

Note: The Character Set is primarily defined by the Language environment specified by kRecSetLanguages. If no language is enabled, only the digits, language-independent punctuation and the miscellaneous characters are validated. Whenever at least one language is enabled (i.e. gets the LANG_ENABLED value), the unaccented lower and uppercase Latin-alphabet characters are validated in addition to the accented lower and uppercase letters defined for the enabled language(s). Language-specific punctuation is enabled with the language concerned. At present these only include the inverted question mark and inverted exclamation mark for Spanish. To change the language selection, create a LANG_SIZE sized array with this LANG_ENA type. Initialize all elements to LANG_DISABLED, enable the required languages and pass the array to the kRecSetLanguages function. To enable a single language only, use the kRecManageLanguages function. That function is useful in other cases as well; see its description.

Enumerator
LANG_DISABLED	The particular language is disabled.
LANG_ENABLED	The particular language is enabled.

◆ LANGUAGE_CODE

enum LANGUAGE_CODE

Language code type.

One of these values can be used with the kRecFindLanguageEx function to specify the type of the language name abbreviation code to search.

Enumerator
LANGCODE_ALL	Look for the language using all supported language codes.
LANGCODE_ENGLISH	Look for the language as an English name.
LANGCODE_INT_3	Look for the language among the CSDK internal 3-letter codes.
LANGCODE_639_1	Look for the language in the ISO/DIS 639-1 standard.
LANGCODE_639_2B	Look for the language in the ISO/DIS 639-2/B standard.
LANGCODE_639_3	Look for the language in the superset of the ISO/DIS 639-3 standard.
LANGCODE_WIN_3	Look for the language among the Windows Three Letter Acronyms.
LANGCODE_BCP_47	Look for the language in the BCP 47 standard.

◆ LANGUAGES

enum LANGUAGES

Possible languages.

This enum identifies the different languages supported directly by the Engine. In the Engine these languages are used in two different places:

For recognition: they define the available languages to form the Language environment of the Character Set.
A subset of these languages can be made available to the checking module. In this case a language (the Spelling language) should be specified.
Note
To define the Language environment of the Character Set, these languages are to be specified as indices to a LANG_ENA array where the selection/de-selection of a language means setting the proper element of the array to LANG_ENA.LANG_ENABLED / LANG_ENA.LANG_DISABLED. Recognition modules, especially the omnifont ones (RM_OMNIFONT_MTX and RM_OMNIFONT_MOR), support the recognition of different accented letters of the languages enumerated here.

Languages not listed here individually are supported for recognition either by combining the available languages and/or by specifying individually validated characters (the LanguagesPlus characters - kRecSetLanguagesPlus) in addition to those defined by the language selection.

When this enum is used for specifying the Spelling language, you can only use languages supported by the current engine configuration (and delivered along with the integrating application). You can also use the LANG_AUTO and the LANG_NO values. (The OmniPage Capture SDK is delivered with support for 21 different spell languages. However, the distribution set of the integrating application may contain support for fewer dictionary languages.)

Specifying two or more languages for recognition results in validation of a combined (OR-ed) set of the characters of these languages. While most of the modules allow any combination of languages, the RM_OMNIFONT_FRX module supports language combinations only within the same Code Page. For example, this module properly processes the English, German and Italian language combinations, since all these languages belong to the Windows ANSI (1252) Code Page. However, when specifying e.g. both the French and Czech languages, RM_OMNIFONT_FRX may fail to recognize some accented characters properly in the Czech alphabet, since these languages are not in the same Code Page.

Enumerator
LANG_ALL	Use with kRecManageLanguages only! See details there.
LANG_ALL_LATIN	Use with kRecManageLanguages only! See details there.
LANG_ALL_ASIAN	Use with kRecManageLanguages only! See details there.
LANG_START	First 'Special' language ID
LANG_UD	User dictionary
LANG_AUTO	Automatic spell checking language selection. Use with kRecSetSpellLanguage only! See details there. (Default for spell checking)
LANG_NO	No spell checking language selection. Use with kRecSetSpellLanguage only! See details there.
LANG_ENG	English language selection. Spelling supported! (Default for recognition). ISO/DIS 639-3 code is 'eng'.
LANG_GER	German language selection. Spelling supported! ISO/DIS 639-3 code is 'deu'.
LANG_FRE	French language selection. Spelling supported! ISO/DIS 639-3 code is 'fra'.
LANG_DUT	Dutch language selection. Spelling supported! ISO/DIS 639-3 code is 'nld'.
LANG_NOR	Norwegian language selection. Spelling supported! ISO/DIS 639-3 code is 'nor'.
LANG_SWE	Swedish language selection. Spelling supported! ISO/DIS 639-3 code is 'swe'.
LANG_FIN	Finnish language selection. Spelling supported! ISO/DIS 639-3 code is 'fin'.
LANG_DAN	Danish language selection. Spelling supported! ISO/DIS 639-3 code is 'dan'.
LANG_ICE	Icelandic language selection. ISO/DIS 639-3 code is 'isl'.
LANG_POR	Portuguese language selection. Spelling supported! ISO/DIS 639-3 code is 'por'.
LANG_SPA	Spanish language selection. Spelling supported! ISO/DIS 639-3 code is 'spa'.
LANG_CAT	Catalan language selection. Spelling supported! ISO/DIS 639-3 code is 'cat'.
LANG_GAL	Galician language selection. Alternate names are Gallegan and Gallego. Spoken in Spain and Portugal. ISO/DIS 639-3 code is 'glg'.
LANG_ITA	Italian language selection. Spelling supported! ISO/DIS 639-3 code is 'ita'.
LANG_MAL	Maltese language selection. ISO/DIS 639-3 code is 'mlt'.
LANG_GRE	Greek language selection. This selection includes the characters of the English language, as well. Spelling supported! ISO/DIS 639-3 code is 'ell'.
LANG_POL	Polish language selection. Spelling supported! ISO/DIS 639-3 code is 'pol'.
LANG_CZH	Czech language selection. Spelling supported! ISO/DIS 639-3 code is 'ces'.
LANG_SLK	Slovak language selection. ISO/DIS 639-3 code is 'slk'.
LANG_HUN	Hungarian language selection. Spelling supported! ISO/DIS 639-3 code is 'hun'.
LANG_SLN	Slovenian language selection. Spelling supported! ISO/DIS 639-3 code is 'slv'.
LANG_CRO	Croatian language selection. ISO/DIS 639-3 code is 'hrv'.
LANG_ROM	Romanian language selection. ISO/DIS 639-3 code is 'ron'.
LANG_ALB	Albanian language selection. ISO/DIS 639-3 code is 'sqi'.
LANG_TUR	Turkish language selection. Spelling supported! ISO/DIS 639-3 code is 'tur'.
LANG_EST	Estonian language selection. ISO/DIS 639-3 code is 'est'.
LANG_LAT	Latvian language selection. ISO/DIS 639-3 code is 'lav'.
LANG_LIT	Lithuanian language selection. ISO/DIS 639-3 code is 'lit'.
LANG_ESP	Esperanto language selection. Constructed language. Spelling supported! ISO/DIS 639-3 code is 'epo'.
LANG_SRL	Serbian (Latin) language selection. The Serbian language's ISO/DIS 639-3 code is 'srp', but the CSDK uses the 'qsl' local code for Latin Serbian writing.
LANG_SRB	Serbian (Cyrillic) language selection. This selection includes the characters of the English language, as well. The Serbian language's ISO/DIS 639-3 code is 'srp'. The CSDK uses this code for only the Cyrillic Serbian writing.
LANG_MAC	Macedonian (Cyrillic) language selection. This selection includes the characters of the English language, as well. ISO/DIS 639-3 code is 'mkd'.
LANG_MOL	Moldavian (Cyrillic) language selection. This selection includes the characters of the English language, as well. ISO/DIS 639-3 code is 'mol'.
LANG_BUL	Bulgarian (Cyrillic) language selection. This selection includes the characters of the English language, as well. ISO/DIS 639-3 code is 'bul'.
LANG_BEL	Byelorussian (Cyrillic) language selection. This selection includes the characters of the English language, as well. Other spellings Belarusian and White Russian. ISO/DIS 639-3 code is 'bel'.
LANG_UKR	Ukrainian (Cyrillic) language selection. This selection includes the characters of the English language, as well. ISO/DIS 639-3 code is 'ukr'.
LANG_RUS	Russian (Cyrillic) language selection. This selection includes the characters of the English language, as well. Spelling supported! ISO/DIS 639-3 code is 'rus'.
LANG_CHE	Chechen language selection. This selection includes the characters of the English language, as well. ISO/DIS 639-3 code is 'che'.
LANG_KAB	Kabardian language selection. This selection includes the characters of the English language, as well. Alternate name is Beslenei. Spoken in Russia and Turkey. ISO/DIS 639-3 code is 'kbd'.
LANG_AFR	Afrikaans language selection. Spoken in South Africa. ISO/DIS 639-3 code is 'afr'.
LANG_AYM	Aymara language selection. Spoken in Bolivia and Peru. ISO/DIS 639-3 code is 'aym'.
LANG_BAS	Basque language selection. ISO/DIS 639-3 code is 'eus'.
LANG_BEM	Bemba language selection. Alternate names are Chibemba, Ichibemba, Wemba, Chiwemba. Spoken in Zambia and Democratic Republic of Congo. ISO/DIS 639-3 code is 'bem'.
LANG_BLA	Blackfoot language selection. Alternate names are Blackfeet, Siksika and Pikanii. Spoken in Canada and USA. ISO/DIS 639-3 code is 'bla'.
LANG_BRE	Breton language selection. ISO/DIS 639-3 code is 'bre'.
LANG_BRA	Portuguese (Brazilian) language selection. Spelling supported! There is no language code for the Brazilian Portuguese language in the ISO/DIS 639-3 standard. The CSDK uses the 'qbp' local code.
LANG_BUG	Bugotu language selection. Spoken in Solomon Islands. ISO/DIS 639-3 code is 'bgt'.
LANG_CHA	Chamorro language selection. Spoken in Guam and Northern Mariana Islands. ISO/DIS 639-3 code is 'cha'.
LANG_CHU	Chuana or Tswana language selection. Spoken in Botswana and South Africa. ISO/DIS 639-3 code is 'tsn'.
LANG_COR	Corsican language selection. ISO/DIS 639-3 code is 'cos'.
LANG_CRW	Crow language selection. Spoken in USA. ISO/DIS 639-3 code is 'cro'.
LANG_ESK	Eskimo language selection. This language selection is a collection of Eskimo and Inuit languages. There is no language code for it in the ISO/DIS 639-3 standard. The CSDK uses the 'qes' local code.
LANG_FAR	Faroese language selection. ISO/DIS 639-3 code is 'fao'.
LANG_FIJ	Fijian language selection. ISO/DIS 639-3 code is 'fij'.
LANG_FRI	Frisian language selection. This is a macro language of three Frisian languages in Germany. ISO/DIS 639-3 code is 'fry'.
LANG_FRU	Friulian language selection. Spoken in Italy. ISO/DIS 639-3 code is 'fur'.
LANG_GLI	Gaelic Irish language selection. ISO/DIS 639-3 code is 'gle'.
LANG_GLS	Gaelic Scottish language selection. ISO/DIS 639-3 code is 'gla'.
LANG_GAN	Ganda or Luganda language selection. Spoken in Uganda. ISO/DIS 639-3 code is 'lug'.
LANG_GUA	Guarani language selection. This is a macro language of the Chiripa and some Guarani languages. Spoken in Paraguay, Argentina, Bolivia and Brazil. ISO/DIS 639-3 code is 'grn'.
LANG_HAN	Hani language selection. Alternate names are Hanhi, Haw and Hani Proper. Spoken in China, Laos and Viet Nam. ISO/DIS 639-3 code is 'hni'.
LANG_HAW	Hawaiian language selection. ISO/DIS 639-3 code is 'haw'.
LANG_IDO	Ido language selection. Constructed language. ISO/DIS 639-3 code is 'ido'.
LANG_IND	Indonesian language selection. ISO/DIS 639-3 code is 'ind'.
LANG_INT	Interlingua language selection. Constructed language. ISO/DIS 639-3 code is 'ina'.
LANG_KAS	Kashubian language selection. Spoken in Poland. ISO/DIS 639-3 code is 'csb'.
LANG_KAW	Kawa language selection. Alternate names are Wa, Va, Vo, Wa Pwo and Wakut. Spoken in China. ISO/DIS 639-3 code is 'wbm'.
LANG_KIK	Kikuyu language selection. Spoken in Kenya. ISO/DIS 639-3 code is 'kik'.
LANG_KON	Kongo language selection. This is a macro language of Laari and Kongo languages. Spoken in the Democratic Republic of the Congo, Angola and Congo. ISO/DIS 639-3 code is 'kon'.
LANG_KPE	Kpelle language selection. This is a macro language of Kpelle languages. Spoken in Liberia and Guinea. ISO/DIS 639-3 code is 'kpe'.
LANG_KUR	Kurdish language selection - if written in the Latin alphabet. This is a macro language of the Kurdish languages. ISO/DIS 639-3 code is 'kur'.
LANG_LTN	Latin language selection. ISO/DIS 639-3 code is 'lat'.
LANG_LUB	Luba language selection. Alternate names are Luba-Lulua, Luba-Kasai, Tshiluba, Luva and Western Luba. Spoken in the Democratic Republic of the Congo. ISO/DIS 639-3 code is 'lua'.
LANG_LUX	Luxembourgish language selection. Alternate names are Luxembourgeois and Letzburgish. Spoken in Luxembourg. ISO/DIS 639-3 code is 'ltz'.
LANG_MLG	Malagasy language selection. This is a macro language of Malagasy languages. Spoken in Madagascar. ISO/DIS 639-3 code is 'mlg'.
LANG_MLY	Malay language selection. ISO/DIS 639-3 code is 'msa'.
LANG_MLN	Malinke language selection. Alternate names are Western Maninkakan, Malinka and Maninga. Spoken in Senegal, Gambia and Mali. ISO/DIS 639-3 code is 'mlq'.
LANG_MAO	Maori language selection. Spoken in New Zealand. ISO/DIS 639-3 code is 'mri'.
LANG_MAY	Mayan language selection. This is a language collection which is not supported by ISO/DIS 639-3, so the CSDK uses the ISO 639-2 code for this: 'MYN'.
LANG_MIA	Miao language selection. This is a macro language of Hmong languages. Alternate name is Hmong. Spoken in China, Laos, Thailand, Myanmar and Viet Nam. ISO/DIS 639-3 code is 'hmn'.
LANG_MIN	Minangkabau language selection. ISO/DIS 639-3 code is 'min'.
LANG_MOH	Mohawk language selection. Spoken in Canada and USA. ISO/DIS 639-3 code is 'moh'.
LANG_NAH	Nahuatl language selection. This is a language collection which is not supported by ISO/DIS 639-3, so the CSDK uses the ISO 639-2 code for this: 'NAH'.
LANG_NYA	Nyanja language selection. Alternate names are Chichewa and Chinyanja. Spoken in Malawi, Mozambique, Zambia and Zimbabwe. ISO/DIS 639-3 code is 'nya'.
LANG_OCC	Occidental language selection. Constructed language. ISO/DIS 639-3 code is 'occ'.
LANG_OJI	Ojibway language selection. This is a macro language of Ojibwa, Chippewa and Ottawa languages. Alternate names are Ojibwa and Ojibwe. Spoken in Canada and USA. ISO/DIS 639-3 code is 'oji'.
LANG_PAP	Papiamento language selection. Spoken in Netherlands Antilles, Aruba. ISO/DIS 639-3 code is 'pap'.
LANG_PID	Pidgin English language selection. Alternate names are Tok Pisin, Neomelanesian and New Guinean Pidgin English. Spoken in Papua New Guinea. ISO/DIS 639-3 code is 'tpi'.
LANG_PRO	Provencal language selection. Alternate name is Occitan. Spoken in France, Italy and Monaco. ISO/DIS 639-3 code is 'prv'.
LANG_QUE	Quechua language selection. This is a macro language of the Quechua languages. Spoken in Peru. ISO/DIS 639-3 code is 'que'.
LANG_RHA	Rhaetic language selection. Alternate names are Romansch and Rhaeto-Romance. Spoken in Switzerland. ISO/DIS 639-3 code is 'roh'.
LANG_ROY	Romany language selection. Spoken all over Europe. ISO/DIS 639-3 code is 'rom'.
LANG_RUA	Rwanda language selection. Alternate names are Kinyarwanda and Rwanda. Spoken in Rwanda, the Democratic Republic of Congo and Uganda. ISO/DIS 639-3 code is 'kin'.
LANG_RUN	Rundi language selection. Spoken in Burundi and Uganda. ISO/DIS 639-3 code is 'run'.
LANG_SAM	Samoan language selection. Spoken in Samoa and American Samoa. ISO/DIS 639-3 code is 'smo'.
LANG_SAR	Sardinian language selection. This is a macro language of the Sardinian languages. ISO/DIS 639-3 code is 'srd'.
LANG_SHO	Shona language selection. Spoken in Zimbabwe, Botswana and Zambia. ISO/DIS 639-3 code is 'sna'.
LANG_SIO	Sioux language selection. Alternate name is Dakota. Spoken in USA and Canada. ISO/DIS 639-3 code is 'dak'.
LANG_SMI	Sami language selection (Combination of the Sami language family). This is a language collection which is not supported by ISO/DIS 639-3, so the CSDK uses the ISO 639-2 code for this: 'SMI'.
LANG_SML	Lule Sami language selection. ISO/DIS 639-3 code is 'smj'.
LANG_SMN	Northern Sami language selection. ISO/DIS 639-3 code is 'sme'.
LANG_SMS	Southern Sami language selection. ISO/DIS 639-3 code is 'sma'.
LANG_SOM	Somali language selection. ISO/DIS 639-3 code is 'som'.
LANG_SOT	Sotho, Suto or Sesuto language selection. Spoken in Lesotho and South Africa. ISO/DIS 639-3 code is 'sot'.
LANG_SUN	Sundanese language selection. Alternate names are Sunda and Priangan. Spoken in Java and Bali in Indonesia. ISO/DIS 639-3 code is 'sun'.
LANG_SWA	Swahili language selection. This is a macro language of the Swahili languages. Spoken in the Democratic Republic of the Congo, Tanzania, Kenya and Somalia. ISO/DIS 639-3 code is 'swa'.
LANG_SWZ	Swazi language selection. Alternate names are Swati, Siswati and Tekela. Spoken in Swaziland, Lesotho, Mozambique and South Africa. ISO/DIS 639-3 code is 'ssw'.
LANG_TAG	Tagalog language selection. Spoken in Philippines. ISO/DIS 639-3 code is 'tgl'.
LANG_TAH	Tahitian language selection. ISO/DIS 639-3 code is 'tah'.
LANG_TIN	Pirez language selection. There is no language code for it in the ISO/DIS 639-3 standard. The CSDK uses the 'qti' local code.
LANG_TON	Tongan language selection. Alternate names are Tonga, Siska and Nyasa. Spoken in Malawi. ISO/DIS 639-3 code is 'ton'.
LANG_TUN	Tun language selection. Alternate names are Tunia and Tunya. Spoken in Chad. ISO/DIS 639-3 code is 'tug'.
LANG_VIS	Visayan language selection. The Visayan language actually consists of three languages: Cebuano, Hiligaynon and Samaran or Waray-waray. Spoken in the Philippines. There is no language code for it in the ISO/DIS 639-3 standard. The CSDK uses the 'qis' local code.
LANG_WEL	Welsh language selection. ISO/DIS 639-3 code is 'cym'.
LANG_WEN	Wend or Sorbian language selection. This is a language collection which is not supported by ISO/DIS 639-3, so the CSDK uses the ISO 639-2 code for this: 'WEN'.
LANG_WOL	Wolof language selection. Spoken in Senegal and Mauritania. ISO/DIS 639-3 code is 'wol'.
LANG_XHO	Xhosa language selection. Spoken in South Africa and Lesotho. ISO/DIS 639-3 code is 'xho'.
LANG_ZAP	Zapotec language selection. This is a macro language of the Zapotec languages. Spoken in Mexico. ISO/DIS 639-3 code is 'zap'.
LANG_ZUL	Zulu language selection. Spoken in South Africa, Lesotho, Malawi, Mozambique and Swaziland. ISO/DIS 639-3 code is 'zul'.
LANG_JPN	Japanese language selection. This selection includes the characters of the English language, as well. ISO/DIS 639-3 code is 'jpn'.
LANG_CHS	Simplified Chinese language selection. This selection includes the characters of the English language, as well. There is no language code for this writing mode in the ISO/DIS 639-3 standard. The CSDK uses the 'qcs' local code.
LANG_CHT	Traditional Chinese language selection. This selection includes the characters of the English language, as well. There is no language code for this writing mode in the ISO/DIS 639-3 standard. The CSDK uses the 'qct' local code.
LANG_KRN	Korean language selection. This selection includes the characters of the English language, as well. ISO/DIS 639-3 code is 'kor'.
LANG_THA	Thai language selection. ISO/DIS 639-3 code is 'tha'. This language is supported on: Windows, Linux, Embedded Linux and MacOS x64.
LANG_ARA	Arabic language selection. ISO/DIS 639-3 code is 'ara'. This language is supported on: Windows, Linux, Embedded Linux, MacOS.
LANG_HEB	Hebrew language selection. ISO/DIS 639-3 code is 'heb'. This language is supported on: Windows, Linux, Embedded Linux and MacOS x64.
LANG_VIE	Vietnamese (Latin) language selection. ISO/DIS 639-3 code is 'vie'. This language is supported on: Windows, Linux, Embedded Linux and MacOS x64.
LANG_SIZE	Number of directly selectable languages.

◆ MANAGE_LANG

enum MANAGE_LANG

Language management actions.

This enum defines the possible management actions for the kRecManageLanguages function.

Enumerator
SET_LANG	Make the given language the single enabled language in the set.
ADD_LANG	Add the given language to the set of enabled languages.
REMOVE_LANG	Remove the given language from the set of enabled languages.
INVERT_LANG	Invert the status of the given language in the set of enabled languages.
IS_LANG_ENABLED	Return REC_OK if the given language is enabled.

◆ OUTCODEPAGETYPE

enum OUTCODEPAGETYPE

Code page types.

Each output code page is classified into one of these categories.

Enumerator
CODEP_UNKNOWN	Code Pages not classified to any further categories, e.g. Roman-8.
SPECIFIC	16-bit Code Pages, e.g. UNICODE.
ASCII_BASED	Code Pages typically used by different localized DOS versions and DOS applications, e.g. Code Page 437.
ANSI_BASED	Code Pages typically used by different localized Windows versions and Windows applications, e.g. Windows Greek.
MAC_BASED	Code Pages typically used by different localized MacOS versions and applications, e.g. Macintosh Central European CP.
INTERNAL_CP	Reserved 8-bit and 16-bit Code Pages used internally by the Engine.
ASIAN_CODEPAGE	Multi-byte code pages for Asian languages.

◆ RM_FLAGS

enum RM_FLAGS

Recognition Engines supporting a language.

These flags can be used to indicate the set of recognition engines supporting a given language. See LANGUAGE_INFO.

Function Documentation

◆ kRecCheckCodePage()

RECERR RECAPIKRN kRecCheckCodePage	(	int	sid,
		LPWSTR	pMissingChrs,
		size_t	buflen )

Checking the code page.

The kRecCheckCodePage checks whether the current Code Page setting contains all the characters of the current Language environment (language selection, the LanguagesPlus characters), and any characters listed as FilterPlus characters.

Parameters

[in]	sid	Settings Collection ID.
[out]	pMissingChrs	Pointer to a buffer to hold any missing characters returned by the function.
[in]	buflen	Specifies the size of the buffer in bytes. It must be large enough to hold all the characters and the terminating double zero.

Return values

RECERR

Note: The buffer will contain any character (in 16-bit UNICODE) validated for recognition, which cannot be found in the current Code Page.; The specification of this function in C# is:
RECERR kRecCheckCodePage(int sid, StringBuilder pMissingChrs);

kRecCheckCodePage
RECERR RECAPIKRN kRecCheckCodePage(int sid, LPWSTR pMissingChrs, size_t buflen)
Checking the code page.

RECERR
RECERR
Error codes.
Definition RECERR_doc.h:19; The specification of this function in Java is:
int kRecCheckCodePage(int sid, String[] pMissingChrs); The specification of this function in Python is:
def kRecCheckCodePage(sid: int) -> Tuple[int, str]

◆ kRecConvertCodePage2Unicode()

RECERR RECAPIKRN kRecConvertCodePage2Unicode	(	int	sid,
		const LPBYTE	pInput,
		size_t *	pInputLen,
		LPWCH	pUniCode )

Converting from the current code page to 16-bit UNICODE.

This utility function converts a single character code from the current Code Page value to its 16-bit UNICODE representation.

Parameters

[in]	sid	Settings Collection ID.
[in]	pInput	Pointer to the input buffer containing the character code to be converted.
[in,out]	pInputLen	Pointer to a variable containing the length of the input buffer. Upon returning, the variable gets the exact length of the converted code.
[out]	pUniCode	Pointer to a variable to store the result of the conversion.

Return values

RECERR

Note: This function can be useful whenever a character or a character string should be passed to a RecAPI API function requiring the character's or string's 16-bit UNICODE representation. (e.g. kRecSetFilterPlus).; The current Code Page can be changed by a previous kRecSetCodePage function call.; For characters that are illegal in the given code page this function returns with CHR_ILLEGALCODE_ERR error code.; The function checks whether the passed length is enough for representing the input character. If it is not enough, the function returns with CHR_CODELENGTH_ERR error. If it is enough, the length parameter gets the exact length of the specified input code. This is useful when a whole word or longer string is converted character-by-character, because this retrieved exact length gives the bytes to be skipped for the next character of the input string.; The specification of this function in C# is:
RECERR kRecConvertCodePage2Unicode(int sid, byte[] pInput, out int pInputLen, out char pUniCode);

kRecConvertCodePage2Unicode
RECERR RECAPIKRN kRecConvertCodePage2Unicode(int sid, const LPBYTE pInput, size_t *pInputLen, LPWCH pUniCode)
Converting from the current code page to 16-bit UNICODE.

◆ kRecConvertCodePage2UnicodeEx()

RECERR RECAPIKRN kRecConvertCodePage2UnicodeEx	(	int	sid,
		const char *	pInput,
		size_t *	pInputLen,
		WCHAR **	ppOutput,
		size_t *	pOutputCount,
		int	flags )

Converting a character or a string from a code page to 16-bit UNICODE.

This utility function converts either a single character or a string from UTF-8, ANSI or the current Code Page to a 16-bit UNICODE character or string.

Parameters

[in]	sid	Settings Collection ID.
[in]	pInput	Pointer to the input buffer containing the character code or the string to be converted.
[in,out]	pInputLen	Pointer to a variable containing the length of the input buffer. Upon returning, the variable gets the exact length of the converted input codes. This parameter can be NULL when a zero terminated string is to be converted.
[out]	ppOutput	Address of a WCHAR* variable that will be filled with the pointer to the allocated output buffer having the result of conversion.
[out]	pOutputCount	Address of a variable that will be filled with the number of converted WCHAR characters. This parameter can be NULL.
[in]	flags	Codepage Conversion Mode flags (CCM_*) describing conversion details.

Return values

RECERR

Note: This function can be useful whenever a character or a character string should be passed to a RecAPI API function requiring the character's or string's 16-bit UNICODE representation. (e.g. kRecSetFilterPlus).; This function is an extension of the kRecConvertCodePage2Unicode function: this extended one supports string conversion, too.
kRecConvertCodePage2Unicode

is functionally equivalent to
kRecConvertCodePage2UnicodeEx(sid, pInput, pInputLen, &pUniCode, NULL, CCM_CP_CURRENT | CCM_CONV_CHAR)

CCM_CP_CURRENT
#define CCM_CP_CURRENT
Use the current code page for conversion.
Definition KernelApi.h:12837

CCM_CONV_CHAR
#define CCM_CONV_CHAR
Convert a single character only.
Definition KernelApi.h:12840

kRecConvertCodePage2UnicodeEx
RECERR RECAPIKRN kRecConvertCodePage2UnicodeEx(int sid, const char *pInput, size_t *pInputLen, WCHAR **ppOutput, size_t *pOutputCount, int flags)
Converting a character or a string from a code page to 16-bit UNICODE.

. See that function's description also.; The input character or string is coded either in UTF-8, or in the currently active Code Page. This must be specified in the flags parameter with the
CCM_CP_UTF8

CCM_CP_UTF8
#define CCM_CP_UTF8
Use the UTF-8 code page for conversion.
Definition KernelApi.h:12838

,
CCM_CP_ANSI

CCM_CP_ANSI
#define CCM_CP_ANSI
Use the ANSI code page (CP 1252) for conversion.
Definition KernelApi.h:12839

or
CCM_CP_CURRENT

values. (The current Code Page can be changed by a previous kRecSetCodePage function call.); Another flag bit specifies whether a multi-character string, or just a single character is to be converted:
CCM_CONV_STRING

CCM_CONV_STRING
#define CCM_CONV_STRING
Convert the full zero terminated string.
Definition KernelApi.h:12841

or
CCM_CONV_CHAR

. The conversion stops after a zero code. When the input is a zero terminated string, the input buffer's length (the
pInputLen

parameter) can be NULL.; When the input buffer is a UTF-8 string, the pre-combined
CCM_UTF8_STRING

value can be used as the
flags

parameter.; The function returns the converted character or string in an allocated buffer (see
ppOutput

). This buffer must be released using kRecFree when not needed.; When a zero terminated string is successfully converted, the returned character count includes the zero termination, too.

◆ kRecConvertUnicode2CodePage()

RECERR RECAPIKRN kRecConvertUnicode2CodePage	(	int	sid,
		WCHAR	UniCode,
		LPBYTE	pOutput,
		size_t *	pOutputLen )

Converting from 16-bit UNICODE to the current code page.

This utility function converts a 16-bit UNICODE character code to its representation in the current code page.

Parameters

[in]	sid	Settings Collection ID.
[in]	UniCode	Character code to be converted.
[out]	pOutput	Pointer to the output buffer to store the result of conversion. This parameter can be NULL when the required length of the output buffer is queried only.
[in,out]	pOutputLen	Pointer to a variable containing the allocated length of the output buffer. Upon returning, the variable gets the exact length of the converted output code.

Return values

RECERR

Note: The current Code Page can be changed by a previous kRecSetCodePage function call.; If pOutput is NULL the function gives the required length for storing the output code and the return value is CHR_CODELENGTH_ERR.; For illegal 16-bit UNICODE codes this function returns with CHR_ILLEGALCODE_ERR error code.; For 16-bit UNICODE codes not in the given code page, this function returns with CHR_MISSINGEXPORT_ERR error code and retrieves the missing symbol set by kRecSetMissingSymbol.; If the input is UNICODE_REJECTED it retrieves the rejected symbol set by kRecSetRejectionSymbol. If the input is UNICODE_MISSING it retrieves the missing symbol set by kRecSetMissingSymbol.; The specification of this function in C# is:
RECERR kRecConvertUnicode2CodePage(int sid, char UniCode, out byte[] pExport);

kRecConvertUnicode2CodePage
RECERR RECAPIKRN kRecConvertUnicode2CodePage(int sid, WCHAR UniCode, LPBYTE pOutput, size_t *pOutputLen)
Converting from 16-bit UNICODE to the current code page.

◆ kRecConvertUnicode2CodePageEx()

RECERR RECAPIKRN kRecConvertUnicode2CodePageEx	(	int	sid,
		const WCHAR *	pInput,
		size_t *	pInputCount,
		char **	ppOutput,
		size_t *	pOutputLen,
		int	flags )

Converting a 16-bit UNICODE character or string to a code page.

This utility function converts either a single 16-bit UNICODE character or a UNICODE string to its representation in UTF-8, ANSI or the current code page.

Parameters

[in]	sid	Settings Collection ID.
[in]	pInput	Pointer to the input buffer containing the 16-bit UNICODE character code or UNICODE string to be converted.
[in,out]	pInputCount	Pointer to a variable containing the number of WCHARs in the input buffer. Upon returning, the variable gets the exact number of the converted WCHARs. This parameter can be NULL when a single WCHAR or a zero terminated string of them is to be converted.
[out]	ppOutput	Address of a char* variable that will be filled with the pointer to the allocated output buffer having the result of conversion.
[out]	pOutputLen	Address of a variable that will be filled with the length of converted characters. This parameter can be NULL.
[in]	flags	Codepage Conversion Mode flags (CCM_*) describing conversion details.

Return values

RECERR

Note: This function is an extension of the kRecConvertUnicode2CodePage function: this extended one supports string conversion, too.
kRecConvertUnicode2CodePage

is functionally equivalent to
kRecConvertUnicode2CodePageEx(sid, &Unicode, NULL, &pOutput, pOutputLen, CCM_CP_CURRENT | CCM_CONV_CHAR)

kRecConvertUnicode2CodePageEx
RECERR RECAPIKRN kRecConvertUnicode2CodePageEx(int sid, const WCHAR *pInput, size_t *pInputCount, char **ppOutput, size_t *pOutputLen, int flags)
Converting a 16-bit UNICODE character or string to a code page.

. See that function's description also.; The output character or string is coded either in UTF-8, ANSI or the currently active Code Page. This must be specified in the flags parameter with the
CCM_CP_UTF8

,
CCM_CP_ANSI

or
CCM_CP_CURRENT

values. (The current Code Page can be changed by a previous kRecSetCodePage function call.); Another flag bit specifies whether a multi-character string, or just a single character is to be converted:
CCM_CONV_STRING

or
CCM_CONV_CHAR

. The conversion stops after a zero code. When the input is a zero terminated string, the input buffer's length (the
pInputCount

parameter) can be NULL. It can be NULL also when a single WCHAR is to be converted.; When the input is a string and the output will be a UTF-8 string, the pre-combined
CCM_UTF8_STRING

value can be used as the
flags

parameter.; The function returns the converted character or string in an allocated buffer (see
ppOutput

). This buffer must be released using kRecFree when not needed.; When a zero terminated string is successfully converted, the returned length includes the zero termination, too.

◆ kRecFindLanguage()

RECERR RECAPIKRN kRecFindLanguage	(	LPCTSTR	pLangName,
		LANGUAGES *	pLanguage )

Searching for languages.

The kRecFindLanguage function searches for a single language with the given name.

Parameters

[in]	pLangName	The name of the language. It can be an English name, an ISO 639-3, ISO 639-2/B, ISO 639-1 or BCP 47 code, a Windows 3-letter code or a CSDK internal 3-letter code.
[out]	pLanguage	The index of the first language. If no language can be found, LANG_NO will be put in this parameter.

Return values

CHR_MULTIPLELANG_FOUND_WARN	More than one language has been found. The index of the most relevant language is put in the pLanguage parameter even in this case. Use one of the kRecFindLanguages or kRecFindLanguageEx functions to retrieve all the languages.
RECERR	Other error

Note: You might find kRecFindLanguageEx easier to use; that is the suggested function to call.; The specification of this function in C# is:
RECERR kRecFindLanguage(string pLangName, out LANGUAGES pLanguage);

kRecFindLanguage
RECERR RECAPIKRN kRecFindLanguage(LPCTSTR pLangName, LANGUAGES *pLanguage)
Searching for languages.

LANGUAGES
LANGUAGES
Possible languages.
Definition KernelApi.h:1106; The specification of this function in Java is:
int kRecFindLanguage(String pLangName, int[] pLanguage); The specification of this function in Python is:
def kRecFindLanguage(pLangName: str) -> Tuple[int, int]

◆ kRecFindLanguageEx()

RECERR RECAPIKRN kRecFindLanguageEx	(	LANGUAGE_CODE	coding,
		LPCTSTR	pLangName,
		LANGUAGES *	pLanguage,
		LANG_ENA *	pLanguagesOut )

Searching for languages.

The kRecFindLanguageEx function searches for the language or languages with the given name.

Parameters

[in]	coding	Look for the language using all or one of the language coding standards. See LANGUAGE_CODE for details.
[in]	pLangName	The name of the language to find.
[out]	pLanguage	Returns the index of the found language. If no language can be found, LANG_NO will be put in this parameter.
[out]	pLanguagesOut	This parameter can be NULL. If not NULL, must point to an array having LANG_SIZE elements. All the languages matching pLangName by the required language coding are enabled in this array.

Return values

CHR_MULTIPLELANG_FOUND_WARN	More than one language has been found. The index of the most relevant language is put in the pLanguage parameter even in this case, while pLanguagesOut (if not NULL) will contain all of them as LANG_ENABLED.
RECERR	Other error

Note: A special use of this function is when the first parameter (coding) is a Windows LangID (LCID) cast to LANGUAGE_CODE, and pLangName is NULL. In this case the language specified by the LangID will be returned in pLanguage (and pLanguagesOut, if not NULL).; This function is the successor of the kRecFindLanguage and kRecFindLanguages functions. kRecFindLanguage(langname, &lang) is equivalent to kRecFindLanguageEx(LANGCODE_ALL, langname, &lang, NULL), while you have more control with the kRecFindLanguageEx function.; The specification of this function in C# is:
RECERR kRecFindLanguageEx(LANGUAGE_CODE coding, string pLangName, out LANGUAGES pLanguage, out LANG_ENA[] pLanguagesOut);

// or when pLanguagesOut is NULL in C/C++

RECERR kRecFindLanguageEx(LANGUAGE_CODE coding, string pLangName, out LANGUAGES pLanguage);

LANG_ENA
LANG_ENA
Language enable/disable.
Definition KernelApi.h:1051

LANGUAGE_CODE
LANGUAGE_CODE
Language code type.
Definition KernelApi.h:1370

kRecFindLanguageEx
RECERR RECAPIKRN kRecFindLanguageEx(LANGUAGE_CODE coding, LPCTSTR pLangName, LANGUAGES *pLanguage, LANG_ENA *pLanguagesOut)
Searching for languages.; The specification of this function in Java is:
int kRecFindLanguageEx(LANGUAGE_CODE coding, String pLangName, int[] pLanguage, int[] pLanguagesOut); The specification of this function in Python is:
def kRecFindLanguageEx(coding: int, pLangName: str) -> Tuple[int, int, "IntArray"]

◆ kRecFindLanguages()

RECERR RECAPIKRN kRecFindLanguages	(	const LANGUAGE_INFO *	pInfo,
		LANG_ENA *	pLanguagesOut )

Searching for languages.

The kRecFindLanguages function collects languages according to the given language information.

Parameters

[in]	pInfo	Pointer to a structure containing the filter information. If a field is zero or an empty string, it does not affect the filtering. If a field has some real value, only the languages matching that value will be reported. The EnglishName field may contain not only the English name of the language, but the language identifier for any of the following standards: ISO 639-3, ISO 639-2/B, ISO 639-1, BCP 47, Windows 3-letter code or a CSDK internal 3-letter code.
[out]	pLanguagesOut	Pointer to an array having LANG_SIZE elements to give back whether a language is selected or not.

Return values

RECERR

Note: See also kRecFindLanguage and kRecFindLanguageEx. You might find kRecFindLanguageEx easier to use; that is the suggested function to call.; The specification of this function in C# is:
RECERR kRecFindLanguages(LANGUAGE_INFO pInfo, out LANG_ENA[] pLanguagesOut);

kRecFindLanguages
RECERR RECAPIKRN kRecFindLanguages(const LANGUAGE_INFO *pInfo, LANG_ENA *pLanguagesOut)
Searching for languages.

LANGUAGE_INFO
Language info.
Definition KernelApi.h:1350; The specification of this function in Java is:
int kRecFindLanguages(LANGUAGE_INFO pInfo, int[] pLanguagesOut); The specification of this function in Python is:
def kRecFindLanguages(pInfo: "LANGUAGE_INFO") -> Tuple[int, "IntArray"]

◆ kRecGetCodePage()

RECERR RECAPIKRN kRecGetCodePage	(	int	sid,
		LPTSTR	pCodePageName,
		size_t	buflen )

Getting the code page name.

The kRecGetCodePage function will provide the current Code Page name.

Parameters

[in]	sid	Settings Collection ID.
[out]	pCodePageName	Pointer to a buffer for the current setting.
[in]	buflen	Specifies the size of the buffer in bytes.

Return values

RECERR

Note: The length of any Code Page name never exceeds MAXCPNAMELEN characters (including the terminating zero).; This function gets the value of the setting Kernel.Chr.CodePage. This setting can be changed by kRecSetCodePage. If the value of the setting is -1, the function retrieves the default code page from the setting Kernel.Chr.CodePage.Default. See also auto code page.; The specification of this function in C# is:
RECERR kRecGetCodePage(int sid, out string pCodePageName);

kRecGetCodePage
RECERR RECAPIKRN kRecGetCodePage(int sid, LPTSTR pCodePageName, size_t buflen)
Getting the code page name.; The specification of this function in Java is:
int kRecGetCodePage(int sid, String[] pCodePageName); The specification of this function in Python is:
def kRecGetCodePage(sid: int) -> Tuple[int, str]

◆ kRecGetCodePageInfo()

RECERR RECAPIKRN kRecGetCodePageInfo	(	LPCTSTR	pCodePageName,
		LPTSTR	pDesc,
		size_t	size,
		LPOUTCODEPAGETYPE	pCodePageType )

Getting information about the code page.

The kRecGetCodePageInfo function provides information about the specified Code Page: a descriptive string and the category of the Code Page.

Parameters

[in]	pCodePageName	Name of the Code Page inquired.
[out]	pDesc	Pointer to a buffer to hold the Code Page descriptor information.
[in]	size	Specifies the size of the buffer `pDesc`, in bytes. (MAXCPNAMELEN is recommended)
[out]	pCodePageType	Pointer to an OUTCODEPAGETYPE variable to hold basic Code Page category information.

Return values

RECERR

Note: The specification of this function in C# is:
RECERR kRecGetCodePageInfo(string pCodePageName, out string pDesc, out OUTCODEPAGETYPE pCodePageType);

OUTCODEPAGETYPE
OUTCODEPAGETYPE
Code page types.
Definition KernelApi.h:1384

kRecGetCodePageInfo
RECERR RECAPIKRN kRecGetCodePageInfo(LPCTSTR pCodePageName, LPTSTR pDesc, size_t size, LPOUTCODEPAGETYPE pCodePageType)
Getting information about the code page.; The specification of this function in Java is:
int kRecGetCodePageInfo(String pCodePageName, String[] pDesc, OUTCODEPAGETYPE[] pCodePageType); The specification of this function in Python is:
def kRecGetCodePageInfo(pCodePageName: str) -> Tuple[int, str, int]

◆ kRecGetDefaultFilter()

RECERR RECAPIKRN kRecGetDefaultFilter	(	int	sid,
		CHR_FILTER *	pGlfilter )

Getting the global character set filter.

The kRecGetDefaultFilter function inquires the current Global filter setting. The result will be the binary OR-ed combination of one or more disjunct members of CHR_FILTER.

Parameters

[in]	sid	Settings Collection ID.
[out]	pGlfilter	Pointer to a variable to get the current Global filter setting.

Return values

RECERR

Note: This function gets the value of the setting Kernel.Chr.DefaultFilter. (This setting was not saved into a setting file in CSDK 15.); The specification of this function in C# is:
RECERR kRecGetDefaultFilter(int sid, out CHR_FILTER filter);

CHR_FILTER
CHR_FILTER
Recognition filters.
Definition KernelApi.h:730

kRecGetDefaultFilter
RECERR RECAPIKRN kRecGetDefaultFilter(int sid, CHR_FILTER *pGlfilter)
Getting the global character set filter.; The specification of this function in Java is:
int kRecGetDefaultFilter(int sid, CHR_FILTER[] pGlfilter); The specification of this function in Python is:
def kRecGetDefaultFilter(sid: int) -> Tuple[int, int]

◆ kRecGetFilterPlus()

RECERR RECAPIKRN kRecGetFilterPlus	(	int	sid,
		LPWSTR	pFilterPlus,
		size_t	iSize )

Getting FilterPlus characters.

The kRecGetFilterPlus gets the FilterPlus characters setting.

Parameters

[in]	sid	Settings Collection ID.
[out]	pFilterPlus	Pointer to a buffer to get the current FilterPlus character setting in 16-bit UNICODE.
[in]	iSize	Specifies the size of the buffer in bytes. It must be large enough to hold all the characters and the terminating wide-character zero.

Return values

RECERR

Note: This function gets the value of the setting Kernel.Chr.FilterPlus. (This setting was not saved into a setting file in CSDK 15.); The specification of this function in C# is:
RECERR kRecGetFilterPlus(int sid, StringBuilder pFilterPlus);

kRecGetFilterPlus
RECERR RECAPIKRN kRecGetFilterPlus(int sid, LPWSTR pFilterPlus, size_t iSize)
Getting FilterPlus characters.; The specification of this function in Java is:
int kRecGetFilterPlus(int sid, String[] pFilterPlus); The specification of this function in Python is:
def kRecGetFilterPlus(sid: int) -> Tuple[int, str]

◆ kRecGetFirstCodePage()

RECERR RECAPIKRN kRecGetFirstCodePage	(	LPTSTR	pCodePageName,
		size_t	buflen )

Starting enumeration of code pages.

The kRecGetFirstCodePage function together with the kRecGetNextCodePage creates a listing of the available Code Pages.

Parameters

[out]	pCodePageName	Pointer to a buffer for the name of the first available Code Page.
[in]	buflen	Specifies the size of the buffer in bytes.

Return values

RECERR

Note: The length of the Code Page names never exceeds MAXCPNAMELEN characters (including the terminating zero).; The Code Pages are either hard-coded in the Engine (Code Page 437, Roman 8, Greek-ELOT, Greek-MEMOTEK, all the supported Windows Code Pages and the 16-bit UNICODE and WordPerfect) or they are defined in the Code Page Definition files, *.SET. The OmniPage CSDK is shipped with the Code Page Definition file, called RECOGN.SET.; The specification of this function in C# is:
RECERR kRecGetFirstCodePage(out string pCodePageName);

kRecGetFirstCodePage
RECERR RECAPIKRN kRecGetFirstCodePage(LPTSTR pCodePageName, size_t buflen)
Starting enumeration of code pages.

There is a non-enumerating function replacing kRecGetFirstCodePage and kRecGetNextCodePage in C#:
RECERR kRecGetAllCodePages(out string[] codepages);; The specification of this function in Java is:
int kRecGetFirstCodePage(String[] pCodePageName); The specification of this function in Python is:
def kRecGetFirstCodePage() -> Tuple[int, str]

◆ kRecGetLanguageInfo()

RECERR RECAPIKRN kRecGetLanguageInfo	(	LANGUAGES	lang,
		LANGUAGE_INFO *	pInfo )

Getting information about a language.

The kRecGetLanguageInfo function inquires information about a language.

Parameters

[in]	lang	The ID of the inquired language.
[out]	pInfo	Pointer of a structure to give back information about the chosen language.

Return values

RECERR

Note: See the list of language identifiers for details.; Language names are available in ANSI encoding. You may use kRecConvertCodePage2UnicodeEx to convert to UTF-16.; The specification of this function in C# is:
RECERR kRecGetLanguageInfo(LANGUAGES lang, out LANGUAGE_INFO pInfo);

kRecGetLanguageInfo
RECERR RECAPIKRN kRecGetLanguageInfo(LANGUAGES lang, LANGUAGE_INFO *pInfo)
Getting information about a language.; The specification of this function in Java is:
int kRecGetLanguageInfo(int lang, LANGUAGE_INFO pInfo); The specification of this function in Python is:
def kRecGetLanguageInfo(lang: int) -> Tuple[int, "LANGUAGE_INFO"]

◆ kRecGetLanguages()

RECERR RECAPIKRN kRecGetLanguages	(	int	sid,
		LANG_ENA *	pLanguagesOut )

Getting languages.

The kRecGetLanguages function inquires the current language selection.

Parameters

[in]	sid	Settings Collection ID.
[out]	pLanguagesOut	Pointer to an array to get the current language selection. The size of the array must be LANG_SIZE. Each element of this array represents a language from LANGUAGES.

Return values

RECERR

Note: The Language environment is primarily determined by the language selection, optionally supplemented by the LanguagesPlus characters setting.; This function gets the value of the setting Kernel.Languages. This setting can be changed by kRecSetLanguages.; The specification of this function in C# is:
RECERR kRecGetLanguages(int sid, LANG_ENA[] pLanguagesOut);

kRecGetLanguages
RECERR RECAPIKRN kRecGetLanguages(int sid, LANG_ENA *pLanguagesOut)
Getting languages.; The specification of this function in Java is:
int kRecGetLanguages(int sid, int[] pLanguagesOut); The specification of this function in Python is:
def kRecGetLanguages(sid: int) -> Tuple[int, "IntArray"]

◆ kRecGetLanguagesPlus()

RECERR RECAPIKRN kRecGetLanguagesPlus	(	int	sid,
		LPWSTR	pOcrLplus,
		size_t	iBSize )

Getting LanguagesPlus characters.

The kRecGetLanguagesPlus function inquires the current LanguagesPlus characters setting.

Parameters

[in]	sid	Settings Collection ID.
[out]	pOcrLplus	Pointer to a buffer to get the current LanguagesPlus character setting in 16-bit UNICODE.
[in]	iBSize	Specifies the size of the buffer in bytes. It must be large enough to hold all the characters and the terminating wide-character zero.

Return values

RECERR

Note: This function gets the value of the setting Kernel.Chr.LanguagesPlus. (This setting was not saved into a setting file in CSDK 15.); The specification of this function in C# is:
RECERR kRecGetLanguagesPlus(int sid, StringBuilder pOcrLplus);

kRecGetLanguagesPlus
RECERR RECAPIKRN kRecGetLanguagesPlus(int sid, LPWSTR pOcrLplus, size_t iBSize)
Getting LanguagesPlus characters.; The specification of this function in Java is:
int kRecGetLanguagesPlus(int sid, String[] pOcrLplus); The specification of this function in Python is:
def kRecGetLanguagesPlus(sid: int) -> Tuple[int, str]

◆ kRecGetMissingSymbol()

RECERR RECAPIKRN kRecGetMissingSymbol	(	int	sid,
		LPWCH	pwMiss )

Getting missing symbol character.

The kRecGetMissingSymbol function inquires the current missing symbol setting.

Parameters

[in]	sid	Settings Collection ID.
[out]	pwMiss	Pointer of a variable to get the missing symbol setting.

Return values

RECERR

Note: This function gets the value of the setting Kernel.Chr.Missing. This setting can be changed by kRecSetMissingSymbol.; The specification of this function in C# is:
RECERR kRecGetMissingSymbol(int sid, out char wMiss);

kRecGetMissingSymbol
RECERR RECAPIKRN kRecGetMissingSymbol(int sid, LPWCH pwMiss)
Getting missing symbol character.; The specification of this function in Java is:
int kRecGetMissingSymbol(int sid, char[] pwMiss); The specification of this function in Python is:
def kRecGetMissingSymbol(sid: int) -> Tuple[int, str]

◆ kRecGetNextCodePage()

RECERR RECAPIKRN kRecGetNextCodePage	(	LPTSTR	pCodePageName,
		size_t	buflen )

Performing enumeration of code pages.

The kRecGetNextCodePage function together with the kRecGetFirstCodePage creates a listing of the available Code Pages.

Parameters

[out]	pCodePageName	Pointer of a buffer for the name of the next available Code Page.
[in]	buflen	Specifies the size of the buffer in bytes.

Return values

RECERR

Note: The length of the Code Page names never exceeds MAXCPNAMELEN characters (including the terminating zero).; Repeated calls to this function get a sequence of the available Code Pages and generate a return code REC_OK each time. As soon as the function finds no further item to get, it returns with CHR_NOMORE_WARN, signaling that the list is complete.; The specification of this function in C# is:
RECERR kRecGetNextCodePage(out string pCodePageName);

kRecGetNextCodePage
RECERR RECAPIKRN kRecGetNextCodePage(LPTSTR pCodePageName, size_t buflen)
Performing enumeration of code pages.

There is a non-enumerating function replacing kRecGetFirstCodePage and kRecGetNextCodePage in C#:
RECERR kRecGetAllCodePages(out string[] codepages);; The specification of this function in Java is:
int kRecGetNextCodePage(String[] pCodePageName); The specification of this function in Python is:
def kRecGetNextCodePage() -> Tuple[int, str]

◆ kRecGetPageLanguages()

RECERR RECAPIKRN kRecGetPageLanguages	(	HPAGE	hPage,
		LANG_ENA *	pOcrLanguagesOut )

Getting languages of the page.

The kRecGetPageLanguages function inquires the language selection for a given page.

Parameters

[in]	hPage	Handle of the page.
[out]	pOcrLanguagesOut	Pointer to an array to get the language selection. The size of the array must be LANG_SIZE. Each element of this array represents a language from LANGUAGES.

Return values

RECERR

Note

This function could be important when Automatic Single Language Detection is used. In this case it retrieves the result of the language detection process. Usually a single language is enabled in the pOcrLanguagesOut array, which is found to be most typical of the page. Sometimes, when language detection was not sure, even two languages could be enabled in the array.

This function could be called only after a processing function (like kRecPreprocessImg, kRecLocateZones or kRecRecognize) has been called with the page. Otherwise an error is returned.

If the language detection process could not determine the language of the page it fills pOcrLanguagesOut with the language of the previous page and returns the LANGDET_INHERITED_WARN warning code.

If single language detection is disabled it returns the same array that was returned by kRecGetLanguages at the time of the last processing function.

There is no corresponding kRecSetPageLanguages function. If you want to change the language to be used for recognizing a given page (e.g. after a failed single language detection) you must do the following steps:

disable automatic single language detection with kRecSetSingleLanguageDetection
set the language or languages with e.g. kRecManageLanguages
redo the processing functions starting with kRecPreprocessImg

The specification of this function in C# is:

RECERR kRecGetPageLanguages(IntPtr hPage, LANG_ENA[] pOcrLanguagesOut);

kRecGetPageLanguages

RECERR RECAPIKRN kRecGetPageLanguages(HPAGE hPage, LANG_ENA *pOcrLanguagesOut)

Getting languages of the page.

The specification of this function in Java is:

int kRecGetPageLanguages(HPAGE hPage, int[] pOcrLanguagesOut)

HPAGE

struct RECPAGESTRUCT * HPAGE

Handle of a page in memory.

Definition KernelApi.h:289

The specification of this function in Python is:

def kRecGetPageLanguages(hPage: "HPAGE") -> Tuple[int, "IntArray"]

◆ kRecGetRejectionSymbol()

RECERR RECAPIKRN kRecGetRejectionSymbol	(	int	sid,
		LPWCH	pwRej )

Getting the rejection symbol character.

The kRecGetRejectionSymbol function inquires the current rejection symbol setting.

Parameters

[in]	sid	Settings Collection ID.
[out]	pwRej	Pointer of a variable to get the rejection symbol setting.

Return values

RECERR

Note: This function gets the value of the setting Kernel.Chr.Rejected. This setting can be changed by kRecSetRejectionSymbol.; The specification of this function in C# is:
RECERR kRecGetRejectionSymbol(int sid, out char pwRej);

kRecGetRejectionSymbol
RECERR RECAPIKRN kRecGetRejectionSymbol(int sid, LPWCH pwRej)
Getting the rejection symbol character.; The specification of this function in Java is:
int kRecGetRejectionSymbol(int sid, char[] pwRej); The specification of this function in Python is:
def kRecGetRejectionSymbol(sid: int) -> Tuple[int, str]

◆ kRecGetSingleLanguageDetection()

RECERR RECAPIKRN kRecGetSingleLanguageDetection	(	int	sid,
		INTBOOL *	pbEnable )

Getting the single language detection flag.

The kRecGetSingleLanguageDetection function retrieves the value of the Automatic Single Language Detection setting.

Parameters

[in]	sid	Settings Collection ID.
[out]	pbEnable	Pointer of a variable to store the single language detection flag.

Return values

RECERR

Note: This function gets the value of the setting Kernel.OcrMgr.DetectSingleLanguage. This setting can be changed by kRecSetSingleLanguageDetection.; The specification of this function in C# is:
RECERR kRecGetSingleLanguageDetection(int sid, out bool bEnable);

kRecGetSingleLanguageDetection
RECERR RECAPIKRN kRecGetSingleLanguageDetection(int sid, INTBOOL *pbEnable)
Getting the single language detection flag.; The specification of this function in Java is:
int kRecGetSingleLanguageDetection(int sid, int[] pbEnable); The specification of this function in Python is:
def kRecGetSingleLanguageDetection(sid: int) -> Tuple[int, bool]

◆ kRecManageLanguages()

RECERR RECAPIKRN kRecManageLanguages	(	int	sid,
		MANAGE_LANG	action,
		LANGUAGES	language )

Managing enabled languages.

The kRecManageLanguages function performs some basic management actions (Set, Add, Remove, Invert, Is Enabled) on the Language environment.

Parameters

[in]	sid	Settings Collection ID.
[in]	action	The management action to perform. See the MANAGE_LANG enum.
[in]	language	The single language, or a language set, to enable, disable, or inquire. See the LANGUAGES enum.

Return values

RECERR

Note

This function is an easy-to-use alternative to the kRecSetLanguages and kRecGetLanguages functions. It is especially handy when you need to work with a single language only, but can also be used very well in other cases.

Example of language selection for multi-lingual documents (e.g. English and German):

rc = kRecManageLanguages(sid, SET_LANG, LANG_ENG);

rc = kRecManageLanguages(sid, ADD_LANG, LANG_GER);

kRecManageLanguages

RECERR RECAPIKRN kRecManageLanguages(int sid, MANAGE_LANG action, LANGUAGES language)

Managing enabled languages.

SET_LANG

@ SET_LANG

Definition KernelApi.h:1061

ADD_LANG

@ ADD_LANG

Definition KernelApi.h:1062

LANG_GER

@ LANG_GER

Definition KernelApi.h:1119

LANG_ENG

@ LANG_ENG

Definition KernelApi.h:1117

With the SET_LANG action the language parameter can be LANG_NO as well, which is equivalent to calling kRecSetLanguages with all the array elements being LANG_DISABLED.

The language parameter can be a language set identifier as well. This is only useful when automatic single language detection is used, see kRecSetSingleLanguageDetection. When automatic single language detection is enabled, the following values can be specified as well:

LANG_ALL_LATIN: enable all languages with the Latin script
LANG_ALL_ASIAN: enable all Asian languages (CCJK, Thai and Hebrew)
LANG_ALL: enable all the languages

See also the details of CCJK, Arabic and Hebrew language handling.

IS_LANG_ENABLED checks if the language is enabled or not. kRecManageLanguages returns REC_OK if the language is enabled while it returns CHR_LANG_DISABLED_WARN if disabled. If the language parameter is a language set identifier (like LANG_ALL_ASIAN) REC_OK is returned if at least one of the languages in the set is enabled.

The specification of this function in C# is:

RECERR kRecManageLanguages(int sid, MANAGE_LANG action, LANGUAGES language);

MANAGE_LANG

Language management actions.

Definition KernelApi.h:1060

The specification of this function in Java is:

int kRecManageLanguages(int sid, MANAGE_LANG action, int language)

The specification of this function in Python is:

def kRecManageLanguages(sid: int, action: int, language: int) -> int

◆ kRecSetCodePage()

RECERR RECAPIKRN kRecSetCodePage	(	int	sid,
		LPCTSTR	pCodePageName )

Setting the code page.

The kRecSetCodePage function specifies the Code Page setting of the Engine.

Parameters

[in]	sid	Settings Collection ID.
[in]	pCodePageName	Name of the Code Page to be set. The available Code Pages can be learnt using the kRecGetFirstCodePage and kRecGetNextCodePage function-pair. Auto code page can be selected by `NULL` or empty string.

Return values

RECERR

Note

The current Code Page setting is taken into account in four different cases:

when converting a character to its 16-bit UNICODE representation (kRecConvertCodePage2Unicode or kRecConvertCodePage2UnicodeEx),
when converting a character from its 16-bit UNICODE representation to the current Code Page (kRecConvertUnicode2CodePage or kRecConvertUnicode2CodePageEx),
when interpreting the content of a PDF417 barcode
when converting the recognition data to the final output document during the PID_WRITEFOUTDOC process.

For most languages you may find it best to use one of the Unicode code pages: "Unicode" or "UTF-8".

In order to keep the Code Page setting consistent for all processing steps, it is a good practice to specify the necessary Code Page setting BEFORE calling any Code Page related function, and not to change it until the final output document has been generated.

Selecting an 8-bit Code Page is getting less and less important; for document output use one of the unicode codepages. Otherwise the selected Code Page should be able to express all characters validated for recognition (i.e. the Character Set for all the zones). The kRecCheckCodePage function can be used to decide whether the current Code Page really fulfills this requirement.

When the document contains characters missing from the currently selected Code Page (those returned by the kRecCheckCodePage call), then they are recognized, but in the final output document will be replaced by the Missing symbol (see kRecSetMissingSymbol).

If this function is not called to specify the Code Page setting, the default value of this setting will be applied (Windows: code page of the current OS, Linux and MacOS: UTF-8).

This function sets the value of the setting Kernel.Chr.CodePage. Note that this setting is not a string, but an integer value. Do not save this number but use its string representation as the numerical code value of a given code page may change. Its string representation can be retrieved by kRecGetCodePage. (The functions kRecSaveSettings and kRecSettingSave save its value in a safe mode.)

The specification of this function in C# is:

RECERR kRecSetCodePage(int sid, string pCodePageName);

kRecSetCodePage

RECERR RECAPIKRN kRecSetCodePage(int sid, LPCTSTR pCodePageName)

Setting the code page.

The specification of this function in Java is:

int kRecSetCodePage(int sid, String pCodePageName)

The specification of this function in Python is:

def kRecSetCodePage(sid: int, pCodePageName: str) -> int

◆ kRecSetDefaultFilter()

RECERR RECAPIKRN kRecSetDefaultFilter	(	int	sid,
		CHR_FILTER	Glfilter )

Changing global character set filter.

The kRecSetDefaultFilter function specifies the Global filter, i.e. a Character Set filter which will be applied globally, at page level. If this function is not called by the integrating application after the Engine's initialization, the value FILTER_ALL is applied, i.e. the Language environment will not be filtered globally. The Global filter setting is applied for all zones having the FILTER_DEFAULT in their ZONE::filter field.

Parameters

[in]	sid	Settings Collection ID.
[in]	Glfilter	Global Character Set filter to be applied.

Return values

RECERR

Note: This function sets the value of the setting Kernel.Chr.DefaultFilter. (This setting was not saved into a setting file in CSDK 15.); If the Global filter is set to FILTER_DEFAULT by this function, the Global filter will reset to its default value FILTER_ALL.; All ZONE zones created by the kRecLocateZones function always have the FILTER_DEFAULT value in their ZONE::filter field.; Example for specifying a simple Language environment, in this case consisting of a language selection (English only) and no LanguagesPlus characters. Then a Global filter (Uppercase letters and digits only) is applied:
HPAGE hPage;

int sid = 0; // Settings Collection ID

. . .

rc = kRecManageLanguages(sid, SET_LANG, LANG_ENG);

rc = kRecLocateZones(sid, hPage);

// The function above locates the zones and gives them the FILTER_DEFAULT

// attribute

kRecSetDefaultFilter(sid, (CHR_FILTER)(FILTER_UPPERCASE | FILTER_DIGIT));

kRecSetDefaultFilter
RECERR RECAPIKRN kRecSetDefaultFilter(int sid, CHR_FILTER Glfilter)
Changing global character set filter.

FILTER_UPPERCASE
@ FILTER_UPPERCASE
Definition KernelApi.h:736

FILTER_DIGIT
@ FILTER_DIGIT
Definition KernelApi.h:734

kRecLocateZones
RECERR RECAPIKRN kRecLocateZones(int sid, HPAGE hPage)
Page parsing.; The specification of this function in C# is:
RECERR kRecSetDefaultFilter(int sid, CHR_FILTER filter);; The specification of this function in Java is:
int kRecSetDefaultFilter(int sid, CHR_FILTER Glfilter); The specification of this function in Python is:
def kRecSetDefaultFilter(sid: int, Glfilter: int) -> int

◆ kRecSetFilterPlus()

RECERR RECAPIKRN kRecSetFilterPlus	(	int	sid,
		LPCWSTR	pFilterPlus )

Setting FilterPlus characters.

The kRecSetFilterPlus function specifies a set of individual characters, the FilterPlus characters. The FilterPlus characters can broaden the filtered set of characters globally or on a per-zone basis. To allow the use of these FilterPlus characters, the zone's ZONE::filter field should have the FILTER_PLUS value enabled.

Parameters

[in]	sid	Settings Collection ID.
[in]	pFilterPlus	Pointer of a 16-bit UNICODE string containing the FilterPlus characters to be set. (The string is terminated with a wide-character zero.)

Return values

RECERR

Note: Using this function is equivalent to using
kRecSetFilterPlusExW(sid, 0, pFilterPlus);

See kRecSetFilterPlusEx for the description of the improved function.; This function sets the value of the setting Kernel.Chr.FilterPlus. (This setting was not saved into setting file in CSDK 15.); When characters to be added are contained in the ANSI (1252) Code Page, they can be defined on Windows as a wide-character-string literal, L"characters". If any needed character falls outside the ANSI (1252) Code Page, you can convert the string with the kRecConvertCodePage2UnicodeEx utility function, which converts character codes from UTF-8 or the current Code Page value (kRecSetCodePage) to their 16-bit UNICODE representation.; To broaden the Character Set by the defined characters, the filter value needed in the zone structure is: FILTER_ALL | FILTER_PLUS.; To add characters to a filtered language environment, the binary OR operator should be applied on FILTER_PLUS and the other filter values. The FilterPlus characters are added after the effect of the other filter elements. For example, to validate only letters and digits and the FilterPlus characters (with the punctuation and miscellaneous character categories disabled), the needed value is: FILTER_ALPHA | FILTER_DIGIT | FILTER_PLUS.; To validate only the FilterPlus characters, FILTER_PLUS must be the only filter element in the zone structure field. This disables even the letters validated by the language selection.; The specification of this function in C# is:
RECERR kRecSetFilterPlus(int sid, string pFilterPlus);

kRecSetFilterPlus
RECERR RECAPIKRN kRecSetFilterPlus(int sid, LPCWSTR pFilterPlus)
Setting FilterPlus characters.; The specification of this function in Java is:
int kRecSetFilterPlus(int sid, String pFilterPlus); The specification of this function in Python is:
def kRecSetFilterPlus(sid: int, pFilterPlus: str) -> int

◆ kRecSetFilterPlusEx()

RECERR RECAPIKRN kRecSetFilterPlusEx	(	int	sid,
		int	index,
		LPCTSTR	pFilterPlus )

Setting FilterPlus characters, extended.

The kRecSetFilterPlusEx function specifies a set of individual characters, the FilterPlus characters. The FilterPlus characters can broaden the filtered set of characters globally or on a per-zone basis. To allow the use of these FilterPlus characters, the zone's ZONE::filter field should have the FILTER_PLUS, FILTER_PLUS_1, FILTER_PLUS_2 or FILTER_PLUS_3 values enabled.

Parameters

[in]	sid	Settings Collection ID.
[in]	index	Specifies which set of FilterPlus characters to be set. Index 0 sets FILTER_PLUS, index 1 sets FILTER_PLUS_1, etc., up to FILTER_PLUS_3.
[in]	pFilterPlus	Pointer to a zero terminated string containing the FilterPlus characters to be set.

Return values

RECERR

Note: This function sets the value of the setting Kernel.Chr.FilterPlus, "Kernel.Chr.FilterPlus1", "Kernel.Chr.FilterPlus2" or "Kernel.Chr.FilterPlus3", depending on the index.; The character string can be either ASCII, UTF-8 or UTF-16 (Unicode) coded by using the kRecSetFilterPlusExA, kRecSetFilterPlusExU8 or kRecSetFilterPlusExW functions, respectively. Note that the older kRecSetFilterPlus function supports UTF-16 encoding only. The interpretation of the suffix-less kRecSetFilterPlusEx depends on the environment: On Windows it is either A or W, while on Linux and MacOS it is U8.; To broaden the Character Set by the defined characters for e.g. index 2, the filter value needed in the zone structure is: FILTER_ALL | FILTER_PLUS_2.; To add characters to a filtered language environment, the binary OR operator should be applied on the corresponding FILTER_PLUS filter and the other filter element values. The FilterPlus characters are added after the effect of the other filter elements. For example, to validate only letters and digits and the FilterPlus3 characters (with the punctuation and miscellaneous character categories disabled), the needed value is: FILTER_ALPHA | FILTER_DIGIT | FILTER_PLUS_3.; To validate only the FilterPlus characters, one of the FILTER_PLUS flags must be the only filter element in the zone structure field. This disables even the letters validated by the language selection.; Multiple FILTER_PLUS filters can be combined by ORing them. This way theoretically up to 16 FilterPlus character sets can be specified, but in practice it is easier to use the 4 uncombined ones only.; The specification of this function in C# is:
RECERR kRecSetFilterPlusEx(int sid, int index, string pFilterPlus);

kRecSetFilterPlusEx
RECERR RECAPIKRN kRecSetFilterPlusEx(int sid, int index, LPCTSTR pFilterPlus)
Setting FilterPlus characters, extended.; The specification of this function in Java is:
int kRecSetFilterPlusEx(int sid, int index, String pFilterPlus); The specification of this function in Python is:
def kRecSetFilterPlusEx(sid: int, index: int, pFilterPlus: str) -> int

◆ kRecSetLanguages()

RECERR RECAPIKRN kRecSetLanguages	(	int	sid,
		const LANG_ENA *	pLanguages )

Setting languages.

The kRecSetLanguages function defines the main part of the Language environment of the Character Set. The available languages are represented by the LANGUAGES enum.

Parameters

[in]	sid	Settings Collection ID.
[in]	pLanguages	Address of a LANG_ENA array containing the enabled/disabled information for each language available.

Return values

RECERR

Note: The Language environment is primarily determined by the language selection, optionally supplemented by the LanguagesPlus characters setting.; The language selection is specified through a LANG_ENA type array, where a LANG_ENABLED value means that the language represented by this element is supported. This array must have LANG_SIZE number of elements.; See also the details of CCJK, Arabic and Hebrew language handling.; The recognition module selected for recognition of a zone may also impose restrictions, e.g. the RER recognition module cannot handle Greek characters.; If this function is not called to specify the language(s) to be recognized, the default value, English language, is applied.; Setting the proper Language environment before calling kRecPreprocessImg is recommended to improve the efficiency of image preprocessing. Especially for Asian languages it is essential that the language be set before kRecPreprocessImg.; For selecting a single language (or few languages) you may find the kRecManageLanguages function handy.; Example of language selection for multi-lingual documents (e.g. English and German):
LANG_ENA pLang[LANG_SIZE];

for (int i=0; i<LANG_SIZE; i++) {

pLang[i] = LANG_DISABLED;

}

pLang[LANG_ENG] = LANG_ENABLED;

pLang[LANG_GER] = LANG_ENABLED;

rc = kRecSetLanguages(sid, pLang);

kRecSetLanguages
RECERR RECAPIKRN kRecSetLanguages(int sid, const LANG_ENA *pLanguages)
Setting languages.

LANG_DISABLED
@ LANG_DISABLED
Definition KernelApi.h:1052

LANG_ENABLED
@ LANG_ENABLED
Definition KernelApi.h:1053

LANG_SIZE
@ LANG_SIZE
Definition KernelApi.h:1275; Automatic Single Language Detection is available for cases when the page contains a single language but this language is not known. For details see also kRecSetSingleLanguageDetection.; This function sets the value of the setting Kernel.Languages. This setting can be retrieved by kRecGetLanguages.; The specification of this function in C# is:
RECERR kRecSetLanguages(int sid, LANG_ENA[] pLanguages);; The specification of this function in Java is:
int kRecSetLanguages(int sid, int[] pLanguages); The specification of this function in Python is:
def kRecSetLanguages(sid: int, pLanguages: "IntArray") -> int

◆ kRecSetLanguagesPlus()

RECERR RECAPIKRN kRecSetLanguagesPlus	(	int	sid,
		LPCWSTR	pOcrLplus )

Setting LanguagesPlus characters.

The kRecSetLanguagesPlus function specifies some individual characters, the LanguagesPlus characters. The set of LanguagesPlus characters is added to the set of characters determined by the language selection (kRecSetLanguages). The resulting set of characters is called the Language environment.

Parameters

[in]	sid	Settings Collection ID.
[in]	pOcrLplus	Pointer to a 16-bit UNICODE string containing the LanguagesPlus characters to be set. (The string is terminated with a wide-character zero.)

Return values

RECERR

Note: This function sets the value of the setting Kernel.Chr.LanguagesPlus. (This setting was not saved into setting file in CSDK 15.); This function is useful if a client believes the accented letters validated for a given language are insufficient, or for handling texts containing foreign words.; When non-selectable languages need to be handled, it is usually best to select English only and validate the needed accented letters individually.; When characters to be added are contained in the ANSI (1252) Code Page, the easiest way to define the string to be passed on Windows is with a wide-character-string literal, L"characters". If any needed character falls outside the ANSI (1252) Code Page, you can convert the string with the kRecConvertCodePage2UnicodeEx utility function which converts character codes from UTF-8 or the current Code Page value (kRecSetCodePage) to their 16-bit UNICODE representation.; Another typical use of the kRecSetLanguagesPlus function: two characters (Small E Acute and Capital E Acute) should be added to the German language:
WCHAR *pLangPlus = L"éÉ"; // Works this way on Windows only!

rc = kRecManageLanguages(sid, SET_LANG, LANG_GER);

rc = kRecSetLanguagesPlus(sid, pLangPlus);

kRecSetLanguagesPlus
RECERR RECAPIKRN kRecSetLanguagesPlus(int sid, LPCWSTR pOcrLplus)
Setting LanguagesPlus characters.; The specification of this function in C# is:
RECERR kRecSetLanguagesPlus(int sid, string pOcrLplus);; The specification of this function in Java is:
int kRecSetLanguagesPlus(int sid, String pOcrLplus); The specification of this function in Python is:
def kRecSetLanguagesPlus(sid: int, pOcrLplus: str) -> int

◆ kRecSetMissingSymbol()

RECERR RECAPIKRN kRecSetMissingSymbol	(	int	sid,
		WCHAR	wMiss )

Setting the missing symbol character.

The kRecSetMissingSymbol specifies the code of the missing symbol. The missing symbol is a special character that replaces any character that was recognized by the Engine but could not be represented in the final output document, since the character does not exist in the current Code Page.

Parameters

[in]	sid	Settings Collection ID.
[in]	wMiss	The missing symbol to be set.

Return values

RECERR

Note: During the conversion to the final output document the kRecConvert2DTXT function tries to find a replacement character with a similar shape for one not supported by the current Code Page. Only if this attempt was not successful, the missing symbol will be sent into the final output document.; If this function is not called the default missing character, L'^' (the CIRCUMFLEX character) will be applied.; It is best to use a keyboard character for the missing symbol, so that it is supported by all Code Pages. If the defined missing symbol is itself missing from the selected Code Page, a character with a similar shape is set and a CHR_MISSINGEXPORT_ERR error code is returned.; If the missing symbol to be set is contained within the ANSI 1252 Code Page, you can define it with a character literal, e.g.: L'#'. If it is not, a numeric UNICODE value should be given.; This function sets the value of the setting Kernel.Chr.Missing. This setting can be retrieved by kRecGetMissingSymbol.; The specification of this function in C# is:
RECERR kRecSetMissingSymbol(int sid, char wMiss);

kRecSetMissingSymbol
RECERR RECAPIKRN kRecSetMissingSymbol(int sid, WCHAR wMiss)
Setting the missing symbol character.; The specification of this function in Java is:
int kRecSetMissingSymbol(int sid, char wMiss); The specification of this function in Python is:
def kRecSetMissingSymbol(sid: int, wMiss: str) -> int

◆ kRecSetRejectionSymbol()

RECERR RECAPIKRN kRecSetRejectionSymbol	(	int	sid,
		WCHAR	wRej )

Setting the rejection symbol character.

The kRecSetRejectionSymbol function specifies which character is to be used as a symbol for the rejected characters (i.e. unrecognized by the recognition module used in the zone) in the final output document.

Parameters

[in]	sid	Settings Collection ID.
[in]	wRej	The rejection symbol to be set.

Return values

RECERR

Note: In the Engine there is a special internal code (UNICODE_REJECTED) for marking the unrecognized characters (called rejected characters). However, when exporting to the final output document, the recognition result must also contain this information. The character which will be used as a placeholder for rejected characters is the rejection symbol.; If this function is not called, the default rejection character, L'~' (the TILDE character) will be applied.; It is best to use a keyboard character for the rejection symbol, so that it is supported by all Code Pages. If the defined rejection character is missing from the selected Code Page, a character with a similar shape is set and a CHR_MISSINGEXPORT_ERR error code is returned.; If the rejection symbol to be set is contained within the ANSI 1252 Code Page, you can define it with a character literal, e.g.: L'#'. If it is not, a numeric UNICODE value should be given.; This function sets the value of the setting Kernel.Chr.Rejected. This setting can be retrieved by kRecGetRejectionSymbol.; The specification of this function in C# is:
RECERR kRecSetRejectionSymbol(int sid, char wRej);

kRecSetRejectionSymbol
RECERR RECAPIKRN kRecSetRejectionSymbol(int sid, WCHAR wRej)
Setting the rejection symbol character.

The specification of this function in Java is:
int kRecSetRejectionSymbol(int sid, char wRej)

The specification of this function in Python is:
def kRecSetRejectionSymbol(sid: int, wRej: str) -> int

◆ kRecSetSingleLanguageDetection()

RECERR RECAPIKRN kRecSetSingleLanguageDetection	(	int	sid,
		INTBOOL	bEnable )

Automatic Single Language Detection.

The kRecSetSingleLanguageDetection function enables or disables Automatic Single Language Detection mode.

Parameters

[in]	sid	Settings Collection ID.
[in]	bEnable	Flag that indicates whether Automatic Single Language Detection is enabled or disabled.

Return values

RECERR

Note

The default value of this setting is FALSE. In this case the language of the document must be known and specified before calling the first processing function (typically kRecPreprocessImg). More than one language can be specified even if the document contains text written in those languages, but it is suggested to use only a few (not more than 5) languages. The recognition engine uses all those language dictionaries to recognize the words in the document.

When the language of the document is not known, automatic detection can be requested by calling this function with TRUE parameter and enabling all possible languages with the kRecSetLanguages or kRecManageLanguages functions.

When Automatic Single Language Detection is enabled, the engine works in a special mode: when the first processing function is called (typically kRecPreprocessImg) it tries to determine the language of the page by inspecting the letters and words found on it. The page may contain a single language only, but this language can be any Latin-alphabet or CCJK language or the Arabic one. Further processing in this mode uses the detected language to process the page. If the language cannot be determined, the language of the previous page will be used.

Latin-alphabet languages without a dictionary are not supported. See the list of the language dictionaries of the OmniPage CSDK.

Automatic detection of the Vietnamese, Thai and Hebrew languages is not supported. Very clean documents in Greek, Russian and other Cyrillic languages can be processed with Single Language Detection, but making your application depend on automatic detection of these languages is not encouraged.

To improve detection accuracy, it is worth limiting the set of possible languages by enabling only those languages in the language array that should be considered. There are some special LANG_ALL language set identifiers to make the selection of languages easier. For example, the following code selects all Latin-alphabet and all Asian languages to be used for single language detection:

rc = kRecSetSingleLanguageDetection(sid, TRUE);

rc = kRecManageLanguages(sid, SET_LANG, LANG_ALL_LATIN);

rc = kRecManageLanguages(sid, ADD_LANG, LANG_ALL_ASIAN);

kRecSetSingleLanguageDetection

RECERR RECAPIKRN kRecSetSingleLanguageDetection(int sid, INTBOOL bEnable)

Automatic Single Language Detection.

LANG_ALL_LATIN

@ LANG_ALL_LATIN

Definition KernelApi.h:1108

LANG_ALL_ASIAN

@ LANG_ALL_ASIAN

Definition KernelApi.h:1109

You can limit the set of languages to be considered in the automatic detection even more precisely. For example, the following code enables some Western plus the Japanese and Korean languages only:

rc = kRecSetSingleLanguageDetection(sid, TRUE);
rc = kRecManageLanguages(sid, SET_LANG, LANG_NO);
rc = kRecManageLanguages(sid, ADD_LANG, LANG_ENG);
rc = kRecManageLanguages(sid, ADD_LANG, LANG_GER);
rc = kRecManageLanguages(sid, ADD_LANG, LANG_FRE);
rc = kRecManageLanguages(sid, ADD_LANG, LANG_JPN);
rc = kRecManageLanguages(sid, ADD_LANG, LANG_KRN);

The kRecGetPageLanguages function can be used to learn the detected language of the page.

Automatic Single Language Detection is performed as a relatively fast initial processing step before character recognition, therefore its accuracy is very much dependent on the image quality and many other conditions. Use it mostly in unattended applications where the document's language is not known.

There is no individual function to detect the language of a page. If you want to learn the language, for example, to show it as a hint for the user before recognition, you must perform the following steps:

1. Enable automatic single language detection with kRecSetSingleLanguageDetection.
2. Set the languages to choose the single language from with e.g. kRecManageLanguages.
3. Call kRecPreprocessImg.
4. Learn the detected language by calling kRecGetPageLanguages.

This function sets the value of the setting Kernel.OcrMgr.DetectSingleLanguage.

The specification of this function in C# is:

RECERR kRecSetSingleLanguageDetection(int sid, bool bEnable);

The specification of this function in Java is:

int kRecSetSingleLanguageDetection(int sid, int bEnable)

The specification of this function in Python is:

def kRecSetSingleLanguageDetection(sid: int, bEnable: bool) -> int

Classes

Macros

Typedefs

Enumerations

Functions

Codepage Conversion Mode flags for extended code page conversion

Detailed Description

Typedef Documentation

◆ LANG_ENA

◆ OUTCODEPAGETYPE

Enumeration Type Documentation

◆ BASIC_LANGUAGE_CHARSET

◆ CHR_FILTER

◆ CONTINENT

◆ LANG_ENA

◆ LANGUAGE_CODE

◆ LANGUAGES

◆ MANAGE_LANG

◆ OUTCODEPAGETYPE

◆ RM_FLAGS

Function Documentation

◆ kRecCheckCodePage()

◆ kRecConvertCodePage2Unicode()

◆ kRecConvertCodePage2UnicodeEx()

◆ kRecConvertUnicode2CodePage()

◆ kRecConvertUnicode2CodePageEx()

◆ kRecFindLanguage()

◆ kRecFindLanguageEx()

◆ kRecFindLanguages()

◆ kRecGetCodePage()

◆ kRecGetCodePageInfo()

◆ kRecGetDefaultFilter()

◆ kRecGetFilterPlus()

◆ kRecGetFirstCodePage()

◆ kRecGetLanguageInfo()

◆ kRecGetLanguages()

◆ kRecGetLanguagesPlus()

◆ kRecGetMissingSymbol()

◆ kRecGetNextCodePage()

◆ kRecGetPageLanguages()

◆ kRecGetRejectionSymbol()

◆ kRecGetSingleLanguageDetection()

◆ kRecManageLanguages()

◆ kRecSetCodePage()

◆ kRecSetDefaultFilter()

◆ kRecSetFilterPlus()

◆ kRecSetFilterPlusEx()

◆ kRecSetLanguages()

◆ kRecSetLanguagesPlus()

◆ kRecSetMissingSymbol()

◆ kRecSetRejectionSymbol()

◆ kRecSetSingleLanguageDetection()