158 lines
4.2 KiB
Bash
158 lines
4.2 KiB
Bash
# Package metadata for the Tesseract OCR language data port.
# $portName and $portVersion are not set in this section; they are expected
# to be provided by whatever sources this recipe.
SUMMARY="Language data files for Tesseract OCR engine"
DESCRIPTION="Tesseract OCR can be fully trained to recognize new languages and scripts. \
A set of community-made trained language data files is available as \
separate packages, one per language."
HOMEPAGE="https://github.com/tesseract-ocr/"
LICENSE="Apache v2"
COPYRIGHT="1985-1995 HP labs
2012 Google Inc."
REVISION="4"
SOURCE_URI="https://github.com/tesseract-ocr/tessdata/archive/$portVersion.tar.gz"
CHECKSUM_SHA256="5dcb37198336b6953843b461ee535df1401b41008d550fc9e43d0edabca7adb1"
SOURCE_DIR="tessdata-$portVersion"
DISABLE_SOURCE_PACKAGE=yes

# The data files are architecture independent.
ARCHITECTURES="any"

PROVIDES="
$portName = $portVersion
"
# Nothing is compiled, so there are no build-time requirements.
BUILD_REQUIRES="
"
# Table of data-file basename -> human-readable description.
# The keys double as variable-name suffixes and as file-name prefixes
# elsewhere in this recipe, so they must stay valid shell identifiers.
declare -A languages

# Special data files
languages[osd]="orientation and script detection"
languages[equ]="math / equation detection"

# Language data files
languages[afr]="Afrikaans"
languages[amh]="Amharic"
languages[ara]="Arabic"
languages[asm]="Assamese"
languages[aze]="Azerbaijani"
languages[aze_cyrl]="Azerbaijani - Cyrillic"
languages[bel]="Belarusian"
languages[ben]="Bengali"
languages[bod]="Tibetan"
languages[bos]="Bosnian"
languages[bul]="Bulgarian"
languages[cat]="Catalan; Valencian"
languages[ceb]="Cebuano"
languages[ces]="Czech"
languages[chi_sim]="Chinese - Simplified"
languages[chi_tra]="Chinese - Traditional"
languages[chr]="Cherokee"
languages[cym]="Welsh"
languages[dan]="Danish"
languages[dan_frak]="Danish - Fraktur script"
languages[deu]="German"
languages[deu_frak]="German - Fraktur script"
languages[dzo]="Dzongkha"
languages[ell]="Greek, Modern (1453-)"
languages[eng]="English"
languages[enm]="English, Middle (1100-1500)"
languages[epo]="Esperanto"
languages[est]="Estonian"
languages[eus]="Basque"
languages[fas]="Persian"
languages[fin]="Finnish"
languages[fra]="French"
languages[frk]="Frankish"
languages[frm]="French, Middle (ca. 1400-1600)"
languages[gle]="Irish"
languages[glg]="Galician"
languages[grc]="Greek, Ancient (-1453)"
languages[guj]="Gujarati"
languages[hat]="Haitian; Haitian Creole"
languages[heb]="Hebrew"
languages[hin]="Hindi"
languages[hrv]="Croatian"
languages[hun]="Hungarian"
languages[iku]="Inuktitut"
languages[ind]="Indonesian"
languages[isl]="Icelandic"
languages[ita]="Italian"
languages[ita_old]="Italian - Old"
languages[jav]="Javanese"
languages[jpn]="Japanese"
languages[kan]="Kannada"
languages[kat]="Georgian"
languages[kat_old]="Georgian - Old"
languages[kaz]="Kazakh"
languages[khm]="Central Khmer"
languages[kir]="Kirghiz; Kyrgyz"
languages[kor]="Korean"
languages[kur]="Kurdish"
languages[lao]="Lao"
languages[lat]="Latin"
languages[lav]="Latvian"
languages[lit]="Lithuanian"
languages[mal]="Malayalam"
languages[mar]="Marathi"
languages[mkd]="Macedonian"
languages[mlt]="Maltese"
languages[msa]="Malay"
languages[mya]="Burmese"
languages[nep]="Nepali"
languages[nld]="Dutch; Flemish"
languages[nor]="Norwegian"
languages[ori]="Oriya"
languages[pan]="Panjabi; Punjabi"
languages[pol]="Polish"
languages[por]="Portuguese"
languages[pus]="Pushto; Pashto"
languages[ron]="Romanian; Moldavian; Moldovan"
languages[rus]="Russian"
languages[san]="Sanskrit"
languages[sin]="Sinhala; Sinhalese"
languages[slk]="Slovak"
languages[slk_frak]="Slovak - Fraktur script"
languages[slv]="Slovenian"
languages[spa]="Spanish; Castilian"
languages[spa_old]="Spanish; Castilian - Old"
languages[sqi]="Albanian"
languages[srp]="Serbian"
languages[srp_latn]="Serbian - Latin"
languages[swa]="Swahili"
languages[swe]="Swedish"
languages[syr]="Syriac"
languages[tam]="Tamil"
languages[tel]="Telugu"
languages[tgk]="Tajik"
languages[tgl]="Tagalog"
languages[tha]="Thai"
languages[tir]="Tigrinya"
languages[tur]="Turkish"
languages[uig]="Uighur; Uyghur"
languages[ukr]="Ukrainian"
languages[urd]="Urdu"
languages[uzb]="Uzbek"
languages[uzb_cyrl]="Uzbek - Cyrillic"
languages[vie]="Vietnamese"
languages[yid]="Yiddish"
# Generate the per-language package attributes SUMMARY_<lang>,
# PROVIDES_<lang> and REQUIRES_<lang> for every key of the languages table.
# printf -v assigns straight to the computed variable name, so the
# description text never passes through eval and cannot be re-parsed as
# shell code.
for lang in "${!languages[@]}"; do
	desc=${languages[${lang}]}

	printf -v "SUMMARY_${lang}" '%s' "Data files for ${desc}"
	printf -v "PROVIDES_${lang}" '%s' "${portName}_${lang} = ${portVersion}"
	# The \n sequences are stored literally (backslash followed by n), which
	# is exactly what the earlier eval-based assignment produced.
	# NOTE(review): presumably the recipe consumer turns them into entry
	# separators - confirm before changing them to real newlines.
	printf -v "REQUIRES_${lang}" '%s' 'haiku\nvendor_tesseract >= 3\n'
done
# Copy every language's data files into $dataDir/tessdata and hand them to
# packageEntries under that language's name.
# Reads:   languages (its keys are the data-file basenames), dataDir.
# Expects: the <lang>.* data files to be in the current directory.
INSTALL()
{
	local lang

	mkdir -p "$dataDir/tessdata"
	for lang in "${!languages[@]}"; do
		# Only the variable part is quoted so the glob still expands; the
		# dot after the name keeps "ita" from also matching "ita_old.*".
		cp "$lang".* "$dataDir/tessdata"
		packageEntries "$lang" \
			"$dataDir/tessdata/$lang".*
	done
}