# haikuports/app-text/tesseract_data/tesseract_data-3.04.00.recipe

# Package metadata for the Tesseract OCR language data recipe.
# $portName and $portVersion are not assigned anywhere in this file; they are
# expected to be supplied by whatever evaluates the recipe.
SUMMARY="Language data files for Tesseract OCR engine"
# Assembled piece by piece; the resulting value is one single line of text.
DESCRIPTION="Tesseract OCR can be fully trained to recognize new languages and scripts. "
DESCRIPTION+="A set of files for, community made, trained languages are available as "
DESCRIPTION+="separate packages per language."
HOMEPAGE="https://github.com/tesseract-ocr/"
LICENSE="Apache v2"
# Two copyright holders, separated by a newline.
COPYRIGHT=$'1985-1995 HP labs\n2012 Google Inc.'
REVISION="4"
# The upstream archive and the directory it unpacks to both follow the port
# version.
SOURCE_URI="https://github.com/tesseract-ocr/tessdata/archive/${portVersion}.tar.gz"
CHECKSUM_SHA256="5dcb37198336b6953843b461ee535df1401b41008d550fc9e43d0edabca7adb1"
SOURCE_DIR="tessdata-${portVersion}"
DISABLE_SOURCE_PACKAGE="yes"
ARCHITECTURES="any"
# Base package entry; the per-language PROVIDES_<code> entries are generated
# further down from the languages table.
PROVIDES="
${portName} = ${portVersion}
"
# Nothing is needed at build time.
BUILD_REQUIRES="
"
# Table of every data set shipped by this recipe.
#   key   - data set code; used as the suffix of the generated SUMMARY_*,
#           PROVIDES_* and REQUIRES_* variables and as the file name prefix
#           ("<code>.*") when INSTALL copies and packages the data files.
#   value - human-readable description, embedded in the package summary as
#           "Data files for <description>".
declare -A languages=(
	# Special data files
	[osd]="orientation and script detection"
	[equ]="math / equation detection"
	# Language data files
	[afr]="Afrikaans"
	[amh]="Amharic"
	[ara]="Arabic"
	[asm]="Assamese"
	[aze]="Azerbaijani"
	[aze_cyrl]="Azerbaijani - Cyrillic"
	[bel]="Belarusian"
	[ben]="Bengali"
	[bod]="Tibetan"
	[bos]="Bosnian"
	[bul]="Bulgarian"
	[cat]="Catalan; Valencian"
	[ceb]="Cebuano"
	[ces]="Czech"
	[chi_sim]="Chinese - Simplified"
	[chi_tra]="Chinese - Traditional"
	[chr]="Cherokee"
	[cym]="Welsh"
	[dan]="Danish"
	[dan_frak]="Danish - Fraktur script"
	[deu]="German"
	[deu_frak]="German - Fraktur script"
	[dzo]="Dzongkha"
	[ell]="Greek, Modern (1453-)"
	[eng]="English"
	[enm]="English, Middle (1100-1500)"
	[epo]="Esperanto"
	[est]="Estonian"
	[eus]="Basque"
	[fas]="Persian"
	[fin]="Finnish"
	[fra]="French"
	[frk]="Frankish"
	[frm]="French, Middle (ca. 1400-1600)"
	[gle]="Irish"
	[glg]="Galician"
	[grc]="Greek, Ancient (-1453)"
	[guj]="Gujarati"
	[hat]="Haitian; Haitian Creole"
	[heb]="Hebrew"
	[hin]="Hindi"
	[hrv]="Croatian"
	[hun]="Hungarian"
	[iku]="Inuktitut"
	[ind]="Indonesian"
	[isl]="Icelandic"
	[ita]="Italian"
	[ita_old]="Italian - Old"
	[jav]="Javanese"
	[jpn]="Japanese"
	[kan]="Kannada"
	[kat]="Georgian"
	[kat_old]="Georgian - Old"
	[kaz]="Kazakh"
	[khm]="Central Khmer"
	[kir]="Kirghiz; Kyrgyz"
	[kor]="Korean"
	[kur]="Kurdish"
	[lao]="Lao"
	[lat]="Latin"
	[lav]="Latvian"
	[lit]="Lithuanian"
	[mal]="Malayalam"
	[mar]="Marathi"
	[mkd]="Macedonian"
	[mlt]="Maltese"
	[msa]="Malay"
	[mya]="Burmese"
	[nep]="Nepali"
	[nld]="Dutch; Flemish"
	[nor]="Norwegian"
	[ori]="Oriya"
	[pan]="Panjabi; Punjabi"
	[pol]="Polish"
	[por]="Portuguese"
	[pus]="Pushto; Pashto"
	[ron]="Romanian; Moldavian; Moldovan"
	[rus]="Russian"
	[san]="Sanskrit"
	[sin]="Sinhala; Sinhalese"
	[slk]="Slovak"
	[slk_frak]="Slovak - Fraktur script"
	[slv]="Slovenian"
	[spa]="Spanish; Castilian"
	[spa_old]="Spanish; Castilian - Old"
	[sqi]="Albanian"
	[srp]="Serbian"
	[srp_latn]="Serbian - Latin"
	[swa]="Swahili"
	[swe]="Swedish"
	[syr]="Syriac"
	[tam]="Tamil"
	[tel]="Telugu"
	[tgk]="Tajik"
	[tgl]="Tagalog"
	[tha]="Thai"
	[tir]="Tigrinya"
	[tur]="Turkish"
	[uig]="Uighur; Uyghur"
	[ukr]="Ukrainian"
	[urd]="Urdu"
	[uzb]="Uzbek"
	[uzb_cyrl]="Uzbek - Cyrillic"
	[vie]="Vietnamese"
	[yid]="Yiddish"
)
# Generate the per-language package metadata. For every code in the
# `languages` table three variables are defined:
#   SUMMARY_<code>   "Data files for <description>"
#   PROVIDES_<code>  "<portName>_<code> = <portVersion>"
#   REQUIRES_<code>  the haiku and vendor_tesseract (>= 3) requirements
# printf -v assigns to the computed variable name directly, so the description
# text is stored verbatim and never re-parsed as shell code.
for lang in "${!languages[@]}"; do
	desc=${languages[${lang}]}
	printf -v "SUMMARY_${lang}" '%s' "Data files for ${desc}"
	printf -v "PROVIDES_${lang}" '%s' "${portName}_${lang} = ${portVersion}"
	# The "\n" separators are stored as a literal backslash followed by "n",
	# not as newline characters.
	# NOTE(review): presumably whatever reads REQUIRES_* expands them - confirm.
	printf -v "REQUIRES_${lang}" '%s' 'haiku\nvendor_tesseract >= 3\n'
done
# Install the data files of every language and assign them to that
# language's package.
# Globals:   languages (read) - table of data set codes
#            dataDir (read)   - installation data directory
# Calls:     packageEntries <code> <installed files...>, once per language
# Note:      the "<code>.*" source files are looked up in the current
#            directory.
INSTALL()
{
	local lang

	mkdir -p "$dataDir/tessdata"
	for lang in "${!languages[@]}"; do
		# The glob stays outside the quotes so it still expands, while the
		# variable parts are protected from word splitting. "<code>.*" only
		# matches files of exactly this code (e.g. "aze.*" does not match
		# "aze_cyrl.*").
		cp "$lang".* "$dataDir/tessdata"
		packageEntries "$lang" \
			"$dataDir/tessdata/$lang".*
	done
}