From 22579a5ac64379b7544d2e4771682955c24002f8 Mon Sep 17 00:00:00 2001 From: Muhammad Adil Date: Sun, 3 May 2026 08:52:15 +0000 Subject: [PATCH] Add 4 ocr python tutorials MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Categories: general Source: AI Search API Tutorials: - extract text from image – OCR with Aspose AI Spell‑Check - Run OCR on image – Complete Guide to Structured Text Extraction - Python OCR Tutorial – Batch OCR Processing Made Easy - How to Batch OCR with Aspose OCR – Full Python Guide Auto-generated by Professionalize.Tutorials Agent --- .../_index.md | 212 +++++++++++++ .../_index.md | 215 +++++++++++++ .../_index.md | 300 ++++++++++++++++++ .../_index.md | 254 +++++++++++++++ .../_index.md | 228 +++++++++++++ .../_index.md | 213 +++++++++++++ .../_index.md | 296 +++++++++++++++++ .../_index.md | 252 +++++++++++++++ .../_index.md | 230 ++++++++++++++ .../_index.md | 215 +++++++++++++ .../_index.md | 298 +++++++++++++++++ .../_index.md | 236 ++++++++++++++ .../_index.md | 212 +++++++++++++ .../_index.md | 215 +++++++++++++ .../_index.md | 300 ++++++++++++++++++ .../_index.md | 254 +++++++++++++++ .../_index.md | 228 +++++++++++++ .../_index.md | 214 +++++++++++++ .../_index.md | 298 +++++++++++++++++ .../_index.md | 254 +++++++++++++++ .../_index.md | 231 ++++++++++++++ .../_index.md | 217 +++++++++++++ .../_index.md | 300 ++++++++++++++++++ .../_index.md | 258 +++++++++++++++ .../_index.md | 230 ++++++++++++++ .../_index.md | 216 +++++++++++++ .../_index.md | 299 +++++++++++++++++ .../_index.md | 237 ++++++++++++++ .../_index.md | 230 ++++++++++++++ .../_index.md | 217 +++++++++++++ .../_index.md | 300 ++++++++++++++++++ .../_index.md | 254 +++++++++++++++ .../_index.md | 210 ++++++++++++ .../_index.md | 215 +++++++++++++ .../_index.md | 300 ++++++++++++++++++ .../_index.md | 254 +++++++++++++++ .../_index.md | 228 +++++++++++++ .../_index.md | 212 +++++++++++++ .../_index.md | 296 +++++++++++++++++ 
.../_index.md | 252 +++++++++++++++ .../_index.md | 212 +++++++++++++ .../_index.md | 203 ++++++++++++ .../_index.md | 300 ++++++++++++++++++ .../_index.md | 255 +++++++++++++++ .../_index.md | 230 ++++++++++++++ .../_index.md | 216 +++++++++++++ .../_index.md | 299 +++++++++++++++++ .../_index.md | 255 +++++++++++++++ .../_index.md | 230 ++++++++++++++ .../_index.md | 214 +++++++++++++ .../_index.md | 299 +++++++++++++++++ .../_index.md | 255 +++++++++++++++ .../_index.md | 228 +++++++++++++ .../_index.md | 213 +++++++++++++ .../_index.md | 296 +++++++++++++++++ .../_index.md | 252 +++++++++++++++ .../_index.md | 228 +++++++++++++ .../_index.md | 213 +++++++++++++ .../_index.md | 298 +++++++++++++++++ .../_index.md | 254 +++++++++++++++ .../_index.md | 231 ++++++++++++++ .../_index.md | 217 +++++++++++++ .../_index.md | 300 ++++++++++++++++++ .../_index.md | 254 +++++++++++++++ .../_index.md | 230 ++++++++++++++ .../_index.md | 215 +++++++++++++ .../_index.md | 300 ++++++++++++++++++ .../_index.md | 254 +++++++++++++++ .../_index.md | 231 ++++++++++++++ .../_index.md | 216 +++++++++++++ .../_index.md | 300 ++++++++++++++++++ .../_index.md | 256 +++++++++++++++ .../_index.md | 230 ++++++++++++++ .../_index.md | 201 ++++++++++++ .../_index.md | 300 ++++++++++++++++++ .../_index.md | 254 +++++++++++++++ .../_index.md | 230 ++++++++++++++ .../_index.md | 215 +++++++++++++ .../_index.md | 298 +++++++++++++++++ .../_index.md | 254 +++++++++++++++ .../_index.md | 210 ++++++++++++ .../_index.md | 215 +++++++++++++ .../_index.md | 299 +++++++++++++++++ .../_index.md | 254 +++++++++++++++ .../_index.md | 230 ++++++++++++++ .../_index.md | 215 +++++++++++++ .../_index.md | 299 +++++++++++++++++ .../_index.md | 254 +++++++++++++++ .../_index.md | 231 ++++++++++++++ .../_index.md | 215 +++++++++++++ .../_index.md | 300 ++++++++++++++++++ .../_index.md | 254 +++++++++++++++ 92 files changed, 22792 insertions(+) create mode 100644 
ocr/arabic/python/general/extract-text-from-image-ocr-with-aspose-ai-spell-check/_index.md create mode 100644 ocr/arabic/python/general/how-to-batch-ocr-with-aspose-ocr-full-python-guide/_index.md create mode 100644 ocr/arabic/python/general/python-ocr-tutorial-batch-ocr-processing-made-easy/_index.md create mode 100644 ocr/arabic/python/general/run-ocr-on-image-complete-guide-to-structured-text-extractio/_index.md create mode 100644 ocr/chinese/python/general/extract-text-from-image-ocr-with-aspose-ai-spell-check/_index.md create mode 100644 ocr/chinese/python/general/how-to-batch-ocr-with-aspose-ocr-full-python-guide/_index.md create mode 100644 ocr/chinese/python/general/python-ocr-tutorial-batch-ocr-processing-made-easy/_index.md create mode 100644 ocr/chinese/python/general/run-ocr-on-image-complete-guide-to-structured-text-extractio/_index.md create mode 100644 ocr/czech/python/general/extract-text-from-image-ocr-with-aspose-ai-spell-check/_index.md create mode 100644 ocr/czech/python/general/how-to-batch-ocr-with-aspose-ocr-full-python-guide/_index.md create mode 100644 ocr/czech/python/general/python-ocr-tutorial-batch-ocr-processing-made-easy/_index.md create mode 100644 ocr/czech/python/general/run-ocr-on-image-complete-guide-to-structured-text-extractio/_index.md create mode 100644 ocr/dutch/python/general/extract-text-from-image-ocr-with-aspose-ai-spell-check/_index.md create mode 100644 ocr/dutch/python/general/how-to-batch-ocr-with-aspose-ocr-full-python-guide/_index.md create mode 100644 ocr/dutch/python/general/python-ocr-tutorial-batch-ocr-processing-made-easy/_index.md create mode 100644 ocr/dutch/python/general/run-ocr-on-image-complete-guide-to-structured-text-extractio/_index.md create mode 100644 ocr/english/python/general/extract-text-from-image-ocr-with-aspose-ai-spell-check/_index.md create mode 100644 ocr/english/python/general/how-to-batch-ocr-with-aspose-ocr-full-python-guide/_index.md create mode 100644 
ocr/english/python/general/python-ocr-tutorial-batch-ocr-processing-made-easy/_index.md create mode 100644 ocr/english/python/general/run-ocr-on-image-complete-guide-to-structured-text-extractio/_index.md create mode 100644 ocr/french/python/general/extract-text-from-image-ocr-with-aspose-ai-spell-check/_index.md create mode 100644 ocr/french/python/general/how-to-batch-ocr-with-aspose-ocr-full-python-guide/_index.md create mode 100644 ocr/french/python/general/python-ocr-tutorial-batch-ocr-processing-made-easy/_index.md create mode 100644 ocr/french/python/general/run-ocr-on-image-complete-guide-to-structured-text-extractio/_index.md create mode 100644 ocr/german/python/general/extract-text-from-image-ocr-with-aspose-ai-spell-check/_index.md create mode 100644 ocr/german/python/general/how-to-batch-ocr-with-aspose-ocr-full-python-guide/_index.md create mode 100644 ocr/german/python/general/python-ocr-tutorial-batch-ocr-processing-made-easy/_index.md create mode 100644 ocr/german/python/general/run-ocr-on-image-complete-guide-to-structured-text-extractio/_index.md create mode 100644 ocr/greek/python/general/extract-text-from-image-ocr-with-aspose-ai-spell-check/_index.md create mode 100644 ocr/greek/python/general/how-to-batch-ocr-with-aspose-ocr-full-python-guide/_index.md create mode 100644 ocr/greek/python/general/python-ocr-tutorial-batch-ocr-processing-made-easy/_index.md create mode 100644 ocr/greek/python/general/run-ocr-on-image-complete-guide-to-structured-text-extractio/_index.md create mode 100644 ocr/hindi/python/general/extract-text-from-image-ocr-with-aspose-ai-spell-check/_index.md create mode 100644 ocr/hindi/python/general/how-to-batch-ocr-with-aspose-ocr-full-python-guide/_index.md create mode 100644 ocr/hindi/python/general/python-ocr-tutorial-batch-ocr-processing-made-easy/_index.md create mode 100644 ocr/hindi/python/general/run-ocr-on-image-complete-guide-to-structured-text-extractio/_index.md create mode 100644 
ocr/hongkong/python/general/extract-text-from-image-ocr-with-aspose-ai-spell-check/_index.md create mode 100644 ocr/hongkong/python/general/how-to-batch-ocr-with-aspose-ocr-full-python-guide/_index.md create mode 100644 ocr/hongkong/python/general/python-ocr-tutorial-batch-ocr-processing-made-easy/_index.md create mode 100644 ocr/hongkong/python/general/run-ocr-on-image-complete-guide-to-structured-text-extractio/_index.md create mode 100644 ocr/hungarian/python/general/extract-text-from-image-ocr-with-aspose-ai-spell-check/_index.md create mode 100644 ocr/hungarian/python/general/how-to-batch-ocr-with-aspose-ocr-full-python-guide/_index.md create mode 100644 ocr/hungarian/python/general/python-ocr-tutorial-batch-ocr-processing-made-easy/_index.md create mode 100644 ocr/hungarian/python/general/run-ocr-on-image-complete-guide-to-structured-text-extractio/_index.md create mode 100644 ocr/indonesian/python/general/extract-text-from-image-ocr-with-aspose-ai-spell-check/_index.md create mode 100644 ocr/indonesian/python/general/how-to-batch-ocr-with-aspose-ocr-full-python-guide/_index.md create mode 100644 ocr/indonesian/python/general/python-ocr-tutorial-batch-ocr-processing-made-easy/_index.md create mode 100644 ocr/indonesian/python/general/run-ocr-on-image-complete-guide-to-structured-text-extractio/_index.md create mode 100644 ocr/italian/python/general/extract-text-from-image-ocr-with-aspose-ai-spell-check/_index.md create mode 100644 ocr/italian/python/general/how-to-batch-ocr-with-aspose-ocr-full-python-guide/_index.md create mode 100644 ocr/italian/python/general/python-ocr-tutorial-batch-ocr-processing-made-easy/_index.md create mode 100644 ocr/italian/python/general/run-ocr-on-image-complete-guide-to-structured-text-extractio/_index.md create mode 100644 ocr/japanese/python/general/extract-text-from-image-ocr-with-aspose-ai-spell-check/_index.md create mode 100644 ocr/japanese/python/general/how-to-batch-ocr-with-aspose-ocr-full-python-guide/_index.md create 
mode 100644 ocr/japanese/python/general/python-ocr-tutorial-batch-ocr-processing-made-easy/_index.md create mode 100644 ocr/japanese/python/general/run-ocr-on-image-complete-guide-to-structured-text-extractio/_index.md create mode 100644 ocr/korean/python/general/extract-text-from-image-ocr-with-aspose-ai-spell-check/_index.md create mode 100644 ocr/korean/python/general/how-to-batch-ocr-with-aspose-ocr-full-python-guide/_index.md create mode 100644 ocr/korean/python/general/python-ocr-tutorial-batch-ocr-processing-made-easy/_index.md create mode 100644 ocr/korean/python/general/run-ocr-on-image-complete-guide-to-structured-text-extractio/_index.md create mode 100644 ocr/polish/python/general/extract-text-from-image-ocr-with-aspose-ai-spell-check/_index.md create mode 100644 ocr/polish/python/general/how-to-batch-ocr-with-aspose-ocr-full-python-guide/_index.md create mode 100644 ocr/polish/python/general/python-ocr-tutorial-batch-ocr-processing-made-easy/_index.md create mode 100644 ocr/polish/python/general/run-ocr-on-image-complete-guide-to-structured-text-extractio/_index.md create mode 100644 ocr/portuguese/python/general/extract-text-from-image-ocr-with-aspose-ai-spell-check/_index.md create mode 100644 ocr/portuguese/python/general/how-to-batch-ocr-with-aspose-ocr-full-python-guide/_index.md create mode 100644 ocr/portuguese/python/general/python-ocr-tutorial-batch-ocr-processing-made-easy/_index.md create mode 100644 ocr/portuguese/python/general/run-ocr-on-image-complete-guide-to-structured-text-extractio/_index.md create mode 100644 ocr/russian/python/general/extract-text-from-image-ocr-with-aspose-ai-spell-check/_index.md create mode 100644 ocr/russian/python/general/how-to-batch-ocr-with-aspose-ocr-full-python-guide/_index.md create mode 100644 ocr/russian/python/general/python-ocr-tutorial-batch-ocr-processing-made-easy/_index.md create mode 100644 ocr/russian/python/general/run-ocr-on-image-complete-guide-to-structured-text-extractio/_index.md create 
mode 100644 ocr/spanish/python/general/extract-text-from-image-ocr-with-aspose-ai-spell-check/_index.md create mode 100644 ocr/spanish/python/general/how-to-batch-ocr-with-aspose-ocr-full-python-guide/_index.md create mode 100644 ocr/spanish/python/general/python-ocr-tutorial-batch-ocr-processing-made-easy/_index.md create mode 100644 ocr/spanish/python/general/run-ocr-on-image-complete-guide-to-structured-text-extractio/_index.md create mode 100644 ocr/swedish/python/general/extract-text-from-image-ocr-with-aspose-ai-spell-check/_index.md create mode 100644 ocr/swedish/python/general/how-to-batch-ocr-with-aspose-ocr-full-python-guide/_index.md create mode 100644 ocr/swedish/python/general/python-ocr-tutorial-batch-ocr-processing-made-easy/_index.md create mode 100644 ocr/swedish/python/general/run-ocr-on-image-complete-guide-to-structured-text-extractio/_index.md create mode 100644 ocr/thai/python/general/extract-text-from-image-ocr-with-aspose-ai-spell-check/_index.md create mode 100644 ocr/thai/python/general/how-to-batch-ocr-with-aspose-ocr-full-python-guide/_index.md create mode 100644 ocr/thai/python/general/python-ocr-tutorial-batch-ocr-processing-made-easy/_index.md create mode 100644 ocr/thai/python/general/run-ocr-on-image-complete-guide-to-structured-text-extractio/_index.md create mode 100644 ocr/turkish/python/general/extract-text-from-image-ocr-with-aspose-ai-spell-check/_index.md create mode 100644 ocr/turkish/python/general/how-to-batch-ocr-with-aspose-ocr-full-python-guide/_index.md create mode 100644 ocr/turkish/python/general/python-ocr-tutorial-batch-ocr-processing-made-easy/_index.md create mode 100644 ocr/turkish/python/general/run-ocr-on-image-complete-guide-to-structured-text-extractio/_index.md create mode 100644 ocr/vietnamese/python/general/extract-text-from-image-ocr-with-aspose-ai-spell-check/_index.md create mode 100644 ocr/vietnamese/python/general/how-to-batch-ocr-with-aspose-ocr-full-python-guide/_index.md create mode 100644 
ocr/vietnamese/python/general/python-ocr-tutorial-batch-ocr-processing-made-easy/_index.md create mode 100644 ocr/vietnamese/python/general/run-ocr-on-image-complete-guide-to-structured-text-extractio/_index.md diff --git a/ocr/arabic/python/general/extract-text-from-image-ocr-with-aspose-ai-spell-check/_index.md b/ocr/arabic/python/general/extract-text-from-image-ocr-with-aspose-ai-spell-check/_index.md new file mode 100644 index 000000000..aa9483dd1 --- /dev/null +++ b/ocr/arabic/python/general/extract-text-from-image-ocr-with-aspose-ai-spell-check/_index.md @@ -0,0 +1,212 @@ +--- +category: general +date: 2026-05-03 +description: استخراج النص من الصورة باستخدام Aspose OCR وفحص الإملاء بالذكاء الاصطناعي. + تعلم كيفية إجراء OCR على الصورة، تحميل الصورة للـ OCR، التعرف على النص من الفاتورة + وإطلاق موارد وحدة معالجة الرسوميات. +draft: false +keywords: +- extract text from image +- how to ocr image +- load image for ocr +- release gpu resources +- recognize text from invoice +language: ar +og_description: استخراج النص من الصورة باستخدام Aspose OCR وتدقيق إملائي بالذكاء الاصطناعي. + دليل خطوة بخطوة يغطي كيفية إجراء OCR على الصورة، تحميل الصورة للـ OCR، وإطلاق موارد + وحدة معالجة الرسومات. +og_title: استخراج النص من الصورة – دليل شامل للتعرف الضوئي على الأحرف وتدقيق الإملاء +tags: +- OCR +- Aspose +- AI +- Python +title: استخراج النص من الصورة – OCR مع تدقيق إملائي AI من Aspose +url: /ar/python/general/extract-text-from-image-ocr-with-aspose-ai-spell-check/ +--- + +{{< blocks/products/pf/main-wrap-class >}} +{{< blocks/products/pf/main-container >}} +{{< blocks/products/pf/tutorial-page-section >}} + +# استخراج النص من الصورة – دليل OCR كامل وتدقيق الإملاء + +هل احتجت يومًا إلى **استخراج النص من الصورة** لكنك لم تكن متأكدًا أي مكتبة ستوفر لك السرعة والدقة معًا؟ لست وحدك. في العديد من المشاريع الواقعية—مثل معالجة الفواتير، رقمنة الإيصالات، أو مسح العقود—الحصول على نص نظيف وقابل للبحث من صورة هو العائق الأول. 
+ +الخبر السار هو أن Aspose OCR مقترن بنموذج Aspose AI خفيف الوزن يمكنه إنجاز هذه المهمة ببضع أسطر من بايثون. في هذا الدرس سنستعرض **كيفية OCR الصورة**، تحميل الصورة بشكل صحيح، تشغيل معالج تدقيق إملائي مدمج، وأخيرًا **تحرير موارد GPU** حتى يبقى تطبيقك صديقًا للذاكرة. + +بنهاية هذا الدليل ستتمكن من **التعرف على النص من صور الفواتير**، تصحيح أخطاء OCR الشائعة تلقائيًا، والحفاظ على نظافة GPU للدفعة التالية. + +--- + +## ما ستحتاجه + +- Python 3.9 أو أحدث (الكود يستخدم تلميحات النوع لكنه يعمل على إصدارات 3.x السابقة) +- حزم `aspose-ocr` و `aspose-ai` (التثبيت عبر `pip install aspose-ocr aspose-ai`) +- وجود GPU يدعم CUDA اختياري؛ سيعود السكريبت إلى CPU إذا لم يُعثر على GPU. +- صورة مثال، مثل `sample_invoice.png`، موجودة في مجلد يمكنك الإشارة إليه. + +لا أطر عمل ML ثقيلة، ولا تحميل نماذج ضخمة—فقط نموذج Q4‑K‑M مُكمًّى صغير يتناسب بسهولة مع معظم بطاقات GPU. + +## الخطوة 1: تهيئة محرك OCR – استخراج النص من الصورة + +أول ما تقوم به هو إنشاء كائن `OcrEngine` وتحديد اللغة المتوقعة. هنا نختار الإنجليزية ونطلب إخراج نص عادي، وهو مثالي للمعالجة اللاحقة. + +```python +import aocr # Aspose OCR package +import aspose.ai as ai # Aspose AI package + +# Initialise the OCR engine +ocr_engine = aocr.OcrEngine() +ocr_engine.language = aocr.Language.English # Choose any supported language +ocr_engine.recognize_mode = aocr.RecognitionMode.Plain # Plain text makes post‑processing easier +``` + +**لماذا هذا مهم:** تحديد اللغة يحد من مجموعة الأحرف، مما يحسن الدقة. وضع النص العادي يزيل معلومات التخطيط التي عادةً لا تحتاجها عندما تريد فقط استخراج النص من الصورة. + +## الخطوة 2: تحميل الصورة لـ OCR – كيفية OCR الصورة + +الآن نقوم بتمرير صورة فعلية إلى المحرك. الدالة المساعدة `Image.load` تدعم الصيغ الشائعة (PNG، JPEG، TIFF) وتُبسط تفاصيل إدخال/إخراج الملفات. 
+ +```python +# Load the input image – this is the "load image for OCR" step +input_image = aocr.Image.load("YOUR_DIRECTORY/sample_invoice.png") +raw_text = ocr_engine.recognize(input_image) # Returns the recognised text as a string +``` + +**نصيحة:** إذا كانت صورك المصدرية كبيرة، فكر في تغيير حجمها قبل إرسالها إلى المحرك؛ الأبعاد الأصغر يمكن أن تقلل من استهلاك ذاكرة GPU دون الإضرار بجودة التعرف. + +## الخطوة 3: تكوين نموذج Aspose AI – التعرف على النص من الفاتورة + +Aspose AI يأتي بنموذج GGUF صغير يمكنك تنزيله تلقائيًا. يستخدم المثال مستودع `Qwen2.5‑3B‑Instruct‑GGUF`، مُكمًّى إلى `q4_k_m`. كما نخبر وقت التشغيل بتخصيص 20 طبقة على الـ GPU، مما يوازن بين السرعة واستخدام الذاكرة. + +```python +# Model configuration – auto‑download a small Q4‑K‑M quantised model +model_config = ai.AsposeAIModelConfig() +model_config.allow_auto_download = "true" +model_config.hugging_face_repo_id = "bartowski/Qwen2.5-3B-Instruct-GGUF" +model_config.hugging_face_quantization = "q4_k_m" +model_config.gpu_layers = 20 # Use 20 GPU layers when a GPU is available +``` + +**خلف الكواليس:** النموذج المُكمًّى حجمه تقريبًا 1.5 GB على القرص، وهو جزء صغير من نموذج الدقة الكاملة، لكنه لا يزال يلتقط ما يكفي من الفروق اللغوية لتحديد الأخطاء الشائعة في OCR. + +## الخطوة 4: تهيئة AsposeAI وإرفاق معالج تدقيق الإملاء بعد المعالجة + +Aspose AI يتضمن معالج تدقيق إملائي جاهز بعد المعالجة. بإرفاقه، سيتم تنظيف كل نتيجة OCR تلقائيًا. + +```python +# Initialise AsposeAI and attach the built‑in spell‑check post‑processor +ocr_ai = ai.AsposeAI(model_config) # Pass the config we just built +ocr_ai.set_post_processor(ocr_ai.postprocessor_spell_check, {}) # Empty dict → default settings +``` + +**لماذا نستخدم معالج ما بعد المعالجة؟** غالبًا ما يخطئ محرك OCR في قراءة “Invoice” كـ “Invo1ce” أو “Total” كـ “T0tal”. يقوم تدقيق الإملاء بتشغيل نموذج لغة خفيف على السلسلة الأصلية ويصحح تلك الأخطاء دون الحاجة إلى كتابة قاموس مخصص. 
+ +## الخطوة 5: تشغيل معالج تدقيق الإملاء على نتيجة OCR + +مع ربط كل شيء، استدعاء واحد ينتج النص المصحح. نقوم أيضًا بطباعة النسختين الأصلية والمنقحة لتتمكن من رؤية التحسين. + +```python +# Run the spell‑check post‑processor on the OCR result +corrected_text = ocr_ai.run_postprocessor(raw_text) + +print("Original :", raw_text) +print("Corrected:", corrected_text) +``` + +قد يبدو الناتج النموذجي لفاتورة كالتالي: + +``` +Original : Invo1ce #12345 +Date: 2023/07/15 +Total: $1,250.00 +... +Corrected: Invoice #12345 +Date: 2023/07/15 +Total: $1,250.00 +... +``` + +لاحظ كيف تحولت “Invo1ce” إلى الكلمة الصحيحة “Invoice”. هذه هي قوة تدقيق الإملاء المدمج بالذكاء الاصطناعي. + +## الخطوة 6: تحرير موارد GPU – تحرير موارد GPU بأمان + +إذا كنت تشغل هذا في خدمة طويلة الأمد (مثل واجهة ويب API تعالج عشرات الفواتير في الدقيقة)، يجب تحرير سياق GPU بعد كل دفعة. وإلا ستواجه تسربات في الذاكرة وفي النهاية أخطاء “CUDA out of memory”. + +```python +# Release GPU resources – crucial to avoid memory leaks +ocr_ai.free_resources() +``` + +**نصيحة احترافية:** استدعِ `free_resources()` داخل كتلة `finally` أو مدير سياق لضمان تنفيذه دائمًا، حتى إذا حدث استثناء. + +## مثال كامل يعمل + +جمع كل الأجزاء معًا يمنحك سكريبتًا مستقلًا يمكنك إدراجه في أي مشروع. 
+ +```python +# extract_text_from_image.py +import aocr +import aspose.ai as ai + +def main(): + # Step 1: Initialise OCR engine + ocr_engine = aocr.OcrEngine() + ocr_engine.language = aocr.Language.English + ocr_engine.recognize_mode = aocr.RecognitionMode.Plain + + # Step 2: Load image for OCR + input_image = aocr.Image.load("YOUR_DIRECTORY/sample_invoice.png") + raw_text = ocr_engine.recognize(input_image) + + # Step 3: Configure Aspose AI model + model_cfg = ai.AsposeAIModelConfig() + model_cfg.allow_auto_download = "true" + model_cfg.hugging_face_repo_id = "bartowski/Qwen2.5-3B-Instruct-GGUF" + model_cfg.hugging_face_quantization = "q4_k_m" + model_cfg.gpu_layers = 20 + + # Step 4: Initialise AI and attach spell‑check + ocr_ai = ai.AsposeAI(model_cfg) + ocr_ai.set_post_processor(ocr_ai.postprocessor_spell_check, {}) + + # Step 5: Run spell‑check + corrected_text = ocr_ai.run_postprocessor(raw_text) + + print("Original :", raw_text) + print("Corrected:", corrected_text) + + # Step 6: Release GPU resources + ocr_ai.free_resources() + +if __name__ == "__main__": + main() +``` + +احفظ الملف، عدل مسار الصورة، وشغّل `python extract_text_from_image.py`. يجب أن ترى نص الفاتورة المنقح يُطبع في وحدة التحكم. + +## الأسئلة المتكررة (FAQ) + +**س: هل يعمل هذا على أجهزة لا تحتوي على GPU؟** +ج: بالتأكيد. إذا لم يتم اكتشاف GPU، فإن Aspose AI ينتقل إلى التنفيذ على CPU، رغم أنه سيكون أبطأ. يمكنك فرض الاستخدام على CPU بتعيين `model_cfg.gpu_layers = 0`. + +**س: ماذا لو كانت فواتيري بلغة غير الإنجليزية؟** +ج: غيّر `ocr_engine.language` إلى القيمة المناسبة من الـ enum (مثلاً `aocr.Language.Spanish`). نموذج تدقيق الإملاء متعدد اللغات، لكن قد تحصل على نتائج أفضل باستخدام نموذج مخصص للغة. + +**س: هل يمكنني معالجة صور متعددة في حلقة؟** +ج: نعم. فقط انقل خطوات التحميل، التعرف، وما بعد المعالجة داخل حلقة `for`. تذكر استدعاء `ocr_ai.free_resources()` بعد الحلقة أو بعد كل دفعة إذا كنت تعيد استخدام نفس نسخة AI. + +**س: ما حجم تحميل النموذج؟** +ج: تقريبًا 1.5 GB للإصدار المُكمًّى `q4_k_m`. 
يتم تخزينه مؤقتًا بعد التشغيل الأول، لذا فإن التنفيذ اللاحق يكون فوريًا. + +## الخلاصة + +في هذا الدرس أظهرنا كيفية **استخراج النص من الصورة** باستخدام Aspose OCR، تكوين نموذج AI صغير، تطبيق معالج تدقيق إملائي بعد المعالجة، وتحرير موارد GPU بأمان. يغطي سير العمل كل شيء من تحميل الصورة إلى تنظيف الموارد بعد الانتهاء، مما يمنحك خط أنابيب موثوقًا لسيناريوهات **التعرف على النص من الفاتورة**. + +الخطوة التالية؟ جرّب استبدال تدقيق الإملاء بنموذج استخراج كيانات مخصص + +{{< /blocks/products/pf/tutorial-page-section >}} +{{< /blocks/products/pf/main-container >}} +{{< /blocks/products/pf/main-wrap-class >}} +{{< blocks/products/products-backtop-button >}} \ No newline at end of file diff --git a/ocr/arabic/python/general/how-to-batch-ocr-with-aspose-ocr-full-python-guide/_index.md b/ocr/arabic/python/general/how-to-batch-ocr-with-aspose-ocr-full-python-guide/_index.md new file mode 100644 index 000000000..cdaf36c25 --- /dev/null +++ b/ocr/arabic/python/general/how-to-batch-ocr-with-aspose-ocr-full-python-guide/_index.md @@ -0,0 +1,215 @@ +--- +category: general +date: 2026-05-03 +description: كيفية معالجة مجموعة من الصور باستخدام تقنية OCR من Aspose وتدقيق إملائي + بالذكاء الاصطناعي. تعلّم استخراج النص من الصور، تطبيق التدقيق الإملائي، الاستفادة + من موارد الذكاء الاصطناعي المجانية وتصحيح أخطاء OCR. +draft: false +keywords: +- how to batch ocr +- extract text from images +- free ai resources +- apply spell check +- correct ocr errors +language: ar +og_description: كيفية معالجة مجموعة من الصور باستخدام Aspose OCR وفحص الإملاء بالذكاء + الاصطناعي. اتبع دليلًا خطوة بخطوة لاستخراج النص من الصور، وتطبيق فحص الإملاء، واستخدام + موارد الذكاء الاصطناعي المجانية وتصحيح أخطاء OCR. 
+og_title: كيفية تنفيذ OCR دفعيًا باستخدام Aspose OCR – دليل بايثون الكامل +tags: +- OCR +- Python +- AI +- Aspose +title: كيفية تنفيذ OCR دفعيًا باستخدام Aspose OCR – دليل بايثون الكامل +url: /ar/python/general/how-to-batch-ocr-with-aspose-ocr-full-python-guide/ +--- + +{{< blocks/products/pf/main-wrap-class >}} +{{< blocks/products/pf/main-container >}} +{{< blocks/products/pf/tutorial-page-section >}} + +# كيفية تنفيذ OCR دفعيًا باستخدام Aspose OCR – دليل Python كامل + +هل تساءلت يومًا **كيف تقوم بعمل OCR دفعيًا** لمجلد كامل من ملفات PDF أو الصور الممسوحة ضوئيًا دون كتابة سكريبت منفصل لكل ملف؟ أنت لست وحدك. في العديد من خطوط الأنابيب الواقعية تحتاج إلى **استخراج النص من الصور**، تصحيح الأخطاء الإملائية، وأخيرًا تحرير أي موارد AI قد خصصتها. يوضح لك هذا الدليل بالضبط كيفية القيام بذلك باستخدام Aspose OCR، معالج AI خفيف الوزن، وبعض أسطر Python. + +سنستعرض عملية تهيئة محرك OCR، ربط مدقق إملائي AI، التكرار عبر دليل الصور، وتنظيف النموذج بعد ذلك. في النهاية ستحصل على سكريبت جاهز للتشغيل يقوم **بتصحيح أخطاء OCR** تلقائيًا ويحرر **موارد AI مجانية** بحيث يبقى GPU الخاص بك سعيدًا. + +## ما ستحتاجه + +- Python 3.9+ (الكود يستخدم type‑hints لكنه يعمل على إصدارات 3.x السابقة) +- حزمة `asposeocr` (`pip install asposeocr`) – توفر محرك OCR. +- الوصول إلى نموذج Hugging Face `bartowski/Qwen2.5-3B-Instruct-GGUF` (يتم تنزيله تلقائيًا). +- وحدة معالجة رسومية (GPU) بذاكرة VRAM لا تقل عن بضع جيجابايت (السكريبت يحدد `gpu_layers = 30`، يمكنك تقليلها إذا لزم الأمر). + +لا توجد خدمات خارجية، ولا واجهات برمجة تطبيقات مدفوعة – كل شيء يعمل محليًا. + +--- + +## الخطوة 1: إعداد محرك OCR – **كيف تقوم بعمل OCR دفعيًا** بفعالية + +قبل أن نتمكن من معالجة ألف صورة نحتاج إلى محرك OCR قوي. يتيح لنا Aspose OCR اختيار اللغة ووضعية التعرف في استدعاء واحد. 
+ +```python +# Step 1: Initialize the OCR engine for English plain‑text output +def init_ocr() -> aocr.OcrEngine: + ocr_engine = aocr.OcrEngine() + ocr_engine.language = aocr.Language.English # English language pack + ocr_engine.recognize_mode = aocr.RecognitionMode.Plain # Returns raw string, no layout + return ocr_engine +``` + +**لماذا هذا مهم:** ضبط `recognize_mode` إلى `Plain` يحافظ على خفة الناتج، وهو مثالي عندما تخطط لتشغيل مدقق إملائي لاحقًا. إذا كنت بحاجة إلى معلومات التخطيط، يمكنك التحويل إلى `Layout`، لكن ذلك يضيف عبئًا قد لا ترغب به في وظيفة دفعية. + +> **نصيحة احترافية:** إذا كنت تتعامل مع مسحات متعددة اللغات، يمكنك تمرير قائمة مثل `ocr_engine.language = [aocr.Language.English, aocr.Language.Spanish]`. + +--- + +## الخطوة 2: تهيئة معالج AI بعد‑المعالجة – **تطبيق التدقيق الإملائي** على ناتج OCR + +يأتي Aspose AI مع معالج بعد‑معالجة مدمج يمكنه تشغيل أي نموذج تريده. هنا نقوم بجلب نموذج Qwen 2.5 مضغوط من Hugging Face وربط روتين التدقيق الإملائي. + +```python +# Step 2: Configure and start the AI post‑processor +def init_ai() -> aocr.ai.AsposeAI: + model_cfg = AsposeAIModelConfig() + model_cfg.allow_auto_download = "true" + model_cfg.hugging_face_repo_id = "bartowski/Qwen2.5-3B-Instruct-GGUF" + model_cfg.hugging_face_quantization = "q4_k_m" + model_cfg.gpu_layers = 30 # Adjust based on your GPU memory + ai_processor = AsposeAI() + ai_processor.initialize(model_cfg) + + # Attach the built‑in spell‑check post‑processor + ai_processor.set_post_processor(ai_processor.postprocessor_spell_check, {}) + return ai_processor +``` + +**لماذا هذا مهم:** النموذج مضغوط (`q4_k_m`)، مما يقلل استهلاك الذاكرة مع الحفاظ على فهم لغوي جيد. باستدعاء `set_post_processor` نخبر Aspose AI بتنفيذ خطوة **تطبيق التدقيق الإملائي** تلقائيًا على أي سلسلة نمررها. + +> **احذر:** إذا لم تستطع وحدة GPU الخاصة بك التعامل مع 30 طبقة، قلل العدد إلى 15 أو حتى 5 – سيظل السكريبت يعمل، لكنه سيكون أبطأ قليلاً. 
+ +--- + +## الخطوة 3: تشغيل OCR و**تصحيح أخطاء OCR** على صورة واحدة + +الآن بعد أن أصبح كل من محرك OCR ومدقق الإملائي AI جاهزين، نجمعهما. هذه الدالة تحمل صورة، تستخرج النص الخام، ثم تشغل معالج AI بعد‑المعالجة لتنظيفه. + +```python +# Step 3: OCR an image and run the spell‑check post‑processor +def ocr_and_correct(image_path: str, + ocr_engine: aocr.OcrEngine, + ai_processor: aocr.ai.AsposeAI) -> str: + image = aocr.Image.load(image_path) # Load any supported format + raw_text = ocr_engine.recognize(image) # Plain string from OCR + corrected_text = ai_processor.run_postprocessor(raw_text) + return corrected_text +``` + +**لماذا هذا مهم:** إدخال سلسلة OCR الخام مباشرة إلى نموذج AI يمنحنا خطوة **تصحيح أخطاء OCR** **دون كتابة أي تعبيرات regex أو قواميس مخصصة**. النموذج يفهم السياق، لذا يمكنه تصحيح “recieve” → “receive” وحتى الأخطاء الأكثر دقة. + +--- + +## الخطوة 4: **استخراج النص من الصور** بالجملة – الحلقة الفعلية للمعالجة الدفعية + +هنا يبرز سحر **كيفية تنفيذ OCR دفعيًا**. نقوم بالتكرار عبر دليل، نتخطى الملفات غير المدعومة، ونكتب كل ناتج مصحح إلى ملف `.txt`. 
+ +```python +# Step 4: Process an entire folder of images +if __name__ == "__main__": + # Initialize once – reuse for every file + ocr_engine = init_ocr() + ai_processor = init_ai() + + input_dir = "YOUR_DIRECTORY/input_images" + output_dir = "YOUR_DIRECTORY/output_text" + os.makedirs(output_dir, exist_ok=True) + + for file_name in os.listdir(input_dir): + # Only handle common image extensions + if not file_name.lower().endswith(('.png', '.jpg', '.jpeg', '.tif', '.tiff')): + continue + + image_path = os.path.join(input_dir, file_name) + corrected = ocr_and_correct(image_path, ocr_engine, ai_processor) + + txt_path = os.path.join(output_dir, + os.path.splitext(file_name)[0] + ".txt") + with open(txt_path, "w", encoding="utf-8") as txt_file: + txt_file.write(corrected) + + print(f"Processed {file_name}") + + # Step 5: Release **free AI resources** after the batch finishes + ai_processor.free_resources() +``` + +### الناتج المتوقع + +لصورة تحتوي على الجملة *“The quick brown fox jumps over the lazzy dog.”* سترى ملف نصي يحتوي على: + +``` +The quick brown fox jumps over the lazy dog. +``` + +لاحظ أن الحرف “z” المزدوج تم تصحيحه تلقائيًا – هذا هو تدقيق الإملائي AI في العمل. + +**لماذا هذا مهم:** بإنشاء كائنات OCR وAI **مرة واحدة** وإعادة استخدامها، نتجنب عبء تحميل النموذج لكل ملف. هذه هي الطريقة الأكثر كفاءة لـ **كيفية تنفيذ OCR دفعيًا** على نطاق واسع. + +--- + +## الخطوة 5: التنظيف – **تحرير موارد AI** بشكل صحيح + +عند الانتهاء، استدعاء `free_resources()` يحرر ذاكرة GPU، وسياقات CUDA، وأي ملفات مؤقتة أنشأها النموذج. + +```python +# Step 5: Explicitly free GPU and model memory +ai_processor.free_resources() +``` + +تجاوز هذه الخطوة قد يترك تخصيصات GPU معلقة، مما قد يتسبب في تعطل عمليات Python اللاحقة أو استهلاك VRAM. فكر فيها كجزء “إطفاء الأنوار” في وظيفة دفعية. 
| **بطء سرعة المعالجة الدفعية** | يستغرق أكثر من 5 ثوانٍ لكل صورة | اضبط `model_cfg.allow_auto_download = "false"` بعد التشغيل الأول، حتى لا يتم إعادة تنزيل النموذج في كل مرة. |
من خلال الاستفادة من Aspose OCR مع نموذج AI خفيف الوزن، تحصل على **حل شامل** **يستخرج النص من الصور**، **يطبق التدقيق الإملائي**، **يصحّح أخطاء OCR**، و**يحرّر موارد AI** بنظافة. جرّب السكريبت على مجلد اختبار، اضبط عدد طبقات GPU ليتناسب مع عتادك، وستحصل على خط أنابيب جاهز للإنتاج في دقائق. + +هل لديك أسئلة حول تعديل النموذج، معالجة ملفات PDF، أو دمجه في خدمة ويب؟ اترك تعليقًا أدناه أو راسلني على GitHub. برمجة سعيدة، ولتكن OCR دائمًا دقيقة! + +{{< /blocks/products/pf/tutorial-page-section >}} +{{< /blocks/products/pf/main-container >}} +{{< /blocks/products/pf/main-wrap-class >}} +{{< blocks/products/products-backtop-button >}} \ No newline at end of file diff --git a/ocr/arabic/python/general/python-ocr-tutorial-batch-ocr-processing-made-easy/_index.md b/ocr/arabic/python/general/python-ocr-tutorial-batch-ocr-processing-made-easy/_index.md new file mode 100644 index 000000000..be71facb3 --- /dev/null +++ b/ocr/arabic/python/general/python-ocr-tutorial-batch-ocr-processing-made-easy/_index.md @@ -0,0 +1,300 @@ +--- +category: general +date: 2026-05-03 +description: دورة بايثون لتقنية OCR تُظهر كيفية تحميل ملفات PNG، والتعرف على النص + من الصورة، وموارد الذكاء الاصطناعي المجانية لمعالجة OCR على دفعات. +draft: false +keywords: +- python ocr tutorial +- batch ocr processing +- free ai resources +- load png image +- recognize text from image +language: ar +og_description: يُرشدك برنامج تعليمي OCR بلغة بايثون إلى تحميل صور PNG، والتعرف على + النص من الصورة، والتعامل مع موارد الذكاء الاصطناعي المجانية لمعالجة OCR على دفعات.
+og_title: دليل بايثون للتعرف الضوئي على الأحرف – التعرف الضوئي السريع على دفعات باستخدام + موارد الذكاء الاصطناعي المجانية +tags: +- OCR +- Python +- AI +title: دورة بايثون للتعرف الضوئي على الأحرف – معالجة التعرف الضوئي على الأحرف الدفعيّة + بسهولة +url: /ar/python/general/python-ocr-tutorial-batch-ocr-processing-made-easy/ +--- + +{{< blocks/products/pf/main-wrap-class >}} +{{< blocks/products/pf/main-container >}} +{{< blocks/products/pf/tutorial-page-section >}} + +# دليل OCR بايثون – معالجة OCR دفعة واحدة بسهولة + +هل احتجت يوماً إلى **python ocr tutorial** يتيح لك فعلياً تشغيل OCR على العشرات من ملفات PNG دون أن تشعر بالإحباط؟ لست وحدك. في العديد من المشاريع الواقعية عليك **load png image** ملفات، وتغذيتها إلى محرك، ثم تنظيف موارد AI عندما تنتهي. + +في هذا الدليل سنستعرض مثالاً كاملاً وجاهزاً للتنفيذ يوضح بالضبط كيفية **recognize text from image** الملفات، ومعالجتها دفعة واحدة، وتحرير ذاكرة AI الأساسية. في النهاية ستحصل على سكريبت مستقل يمكنك إدراجه في أي مشروع—بدون إضافات غير ضرورية، فقط الأساسيات. + +## ما ستحتاجه + +- Python 3.10 أو أحدث (الصياغة المستخدمة هنا تعتمد على f‑strings وتلميحات الأنواع) +- مكتبة OCR تُظهر طريقة `engine.recognize` – لأغراض العرض سنفترض حزمة خيالية `aocr`، لكن يمكنك استبدالها بـ Tesseract أو EasyOCR، إلخ. +- وحدة المساعدة `ai` المعروضة في مقتطف الشيفرة (تتعامل مع تهيئة النموذج وتنظيف الموارد) +- مجلد يحتوي على ملفات PNG تريد معالجتها + +إذا لم يكن لديك `aocr` أو `ai` مثبتين، يمكنك محاكاتهما باستخدام stubs – راجع قسم “Optional Stubs” قرب النهاية. + +## الخطوة 1: تهيئة محرك AI (تحرير موارد AI) + +قبل أن تغذي أي صورة إلى خط أنابيب OCR، يجب أن يكون النموذج الأساسي جاهزاً. التهيئة مرة واحدة فقط توفر الذاكرة وتسرّع وظائف الدفعات. + +```python +# step_1_initialize.py +import ai # hypothetical helper that wraps the AI model +import aocr # OCR library + +def init_engine(config_path: str = "config.yaml"): + """ + Initialize the AI engine if it hasn't been set up yet. + This uses free AI resources – the engine will be released later. 
+ """ + if not ai.is_initialized(): + ai.initialize(config_path) # auto‑initialize with the provided configuration + else: + print("Engine already initialized.") +``` + +**لماذا هذا مهم:** +استدعاء `ai.initialize` بشكل متكرر لكل صورة سيؤدي إلى تخصيص ذاكرة GPU مراراً وتكراراً، مما سيتسبب في تعطل السكريبت في النهاية. من خلال التحقق من `ai.is_initialized()` نضمن تخصيصاً واحداً فقط – وهذا هو مبدأ “تحرير موارد AI”. + +## الخطوة 2: تحميل ملفات PNG لمعالجة OCR دفعة واحدة + +الآن نجمع جميع ملفات PNG التي نريد تشغيلها عبر OCR. استخدام `pathlib` يحافظ على استقلالية الكود عن نظام التشغيل. + +```python +# step_2_load_images.py +from pathlib import Path +from typing import List + +def collect_png_paths(directory: str) -> List[Path]: + """ + Scan `directory` and return a list of Path objects pointing to PNG files. + """ + base_path = Path(directory) + if not base_path.is_dir(): + raise NotADirectoryError(f"'{directory}' is not a valid folder.") + + png_files = sorted(base_path.glob("*.png")) + if not png_files: + raise FileNotFoundError("No PNG images found in the specified directory.") + + print(f"Found {len(png_files)} PNG image(s) to process.") + return png_files +``` + +**حالة حافة:** +إذا كان المجلد يحتوي على ملفات غير PNG (مثل JPEG) فسيتم تجاهلها، مما يمنع `engine.recognize` من التعطل بسبب تنسيق غير مدعوم. + +## الخطوة 3: تشغيل OCR على كل صورة وتطبيق المعالجة اللاحقة + +مع جاهزية المحرك وإعداد قائمة الملفات، يمكننا التكرار على الصور، استخراج النص الخام، وتسليمه إلى معالج لاحق يقوم بتنظيف الأخطاء الشائعة في OCR (مثل فواصل الأسطر العشوائية). + +```python +# step_3_ocr_batch.py +import aocr +import ai +from pathlib import Path +from typing import List + +def ocr_batch(image_paths: List[Path]) -> List[str]: + """ + Perform OCR on each PNG image and return a list of cleaned strings. 
+ """ + results = [] + for image_path in image_paths: + # Load the image – aocr.Image.load abstracts away Pillow/OpenCV details + img = aocr.Image.load(str(image_path)) + + # Recognize raw text + raw_text = engine.recognize(img) + + # Refine the raw OCR output using the AI post‑processor + cleaned_text = ai.run_postprocessor(raw_text) + results.append(cleaned_text) + + print(f"Processed {image_path.name}: {len(cleaned_text)} characters extracted.") + + return results +``` + +**لماذا نفصل التحميل عن التعرف:** +قد يقوم `aocr.Image.load` بعملية فك ترميز كسولة، مما يكون أسرع للدفعات الكبيرة. إبقاء خطوة التحميل صريحة يجعل من السهل استبدال مكتبة الصور إذا احتجت لاحقاً للتعامل مع ملفات JPEG أو TIFF. + +## الخطوة 4: التنظيف – تحرير موارد AI بعد الانتهاء من الدفعة + +بعد انتهاء الدفعة، يجب تحرير النموذج لتجنب تسرب الذاكرة، خاصةً على الأجهزة التي تدعم GPU. + +```python +# step_4_cleanup.py +import ai + +def release_resources(): + """ + Free any allocated AI resources. Safe to call multiple times. + """ + if ai.is_initialized(): + ai.free_resources() + print("AI resources have been released.") + else: + print("No AI resources were allocated.") +``` + +## تجميع كل شيء معاً – السكريبت الكامل + +فيما يلي ملف واحد يجمع الخطوات الأربع في سير عمل متكامل. احفظه باسم `batch_ocr.py` وشغّله من سطر الأوامر. + +```python +# batch_ocr.py +""" +Python OCR tutorial – end‑to‑end batch OCR processing. +Loads PNG images, runs OCR, post‑processes results, and frees AI resources. 
+""" + +import sys +from pathlib import Path +import ai +import aocr + +# ---------------------------------------------------------------------- +# Helper functions (copied from the steps above) +# ---------------------------------------------------------------------- +def init_engine(cfg: str = "config.yaml"): + if not ai.is_initialized(): + ai.initialize(cfg) + else: + print("Engine already initialized.") + +def collect_png_paths(directory: str): + base_path = Path(directory) + if not base_path.is_dir(): + raise NotADirectoryError(f"'{directory}' is not a valid folder.") + png_files = sorted(base_path.glob("*.png")) + if not png_files: + raise FileNotFoundError("No PNG images found in the specified directory.") + print(f"Found {len(png_files)} PNG image(s) to process.") + return png_files + +def ocr_batch(image_paths): + results = [] + for image_path in image_paths: + img = aocr.Image.load(str(image_path)) + raw_text = engine.recognize(img) + cleaned_text = ai.run_postprocessor(raw_text) + results.append(cleaned_text) + print(f"Processed {image_path.name}: {len(cleaned_text)} characters.") + return results + +def release_resources(): + if ai.is_initialized(): + ai.free_resources() + print("AI resources released.") + else: + print("No resources to release.") + +# ---------------------------------------------------------------------- +# Main execution block +# ---------------------------------------------------------------------- +if __name__ == "__main__": + if len(sys.argv) != 2: + print("Usage: python batch_ocr.py ") + sys.exit(1) + + image_dir = sys.argv[1] + + try: + init_engine() + png_paths = collect_png_paths(image_dir) + texts = ocr_batch(png_paths) + + # Optional: write results to a single text file + output_file = Path("ocr_results.txt") + with output_file.open("w", encoding="utf-8") as f: + for path, txt in zip(png_paths, texts): + f.write(f"--- {path.name} ---\n") + f.write(txt + "\n\n") + print(f"All results saved to {output_file.resolve()}") + 
finally: + release_resources() +``` + +### النتيجة المتوقعة + +تشغيل السكريبت على مجلد يحتوي على ثلاث صور PNG قد يطبع: + +``` +Engine already initialized. +Found 3 PNG image(s) to process. +Processed invoice1.png: 452 characters. +Processed receipt2.png: 317 characters. +Processed flyer3.png: 689 characters. +All results saved to /home/user/ocr_results.txt +AI resources released. +``` + +ملف `ocr_results.txt` سيحتوي على فاصل واضح لكل صورة يليه النص المنظف من OCR. + +## stubs اختيارية لـ aocr و ai (إذا لم تتوفر لديك حزم حقيقية) + +إذا كنت ترغب فقط في اختبار التدفق دون استدعاء مكتبات OCR الضخمة، يمكنك إنشاء وحدات mock بسيطة: + +```python +# aocr/__init__.py +class Image: + @staticmethod + def load(path): + return f"ImageObject({path})" + +def dummy_recognize(image): + return "Raw OCR output for " + str(image) + +engine = type("Engine", (), {"recognize": dummy_recognize})() +``` + +```python +# ai/__init__.py +_state = {"initialized": False} + +def is_initialized(): + return _state["initialized"] + +def initialize(cfg): + print(f"Initializing AI engine with {cfg}") + _state["initialized"] = True + +def run_postprocessor(text): + # Very naive cleanup: strip extra spaces + return " ".join(text.split()) + +def free_resources(): + print("Freeing AI resources") + _state["initialized"] = False +``` + +ضع هذه المجلدات بجوار `batch_ocr.py` وسيعمل السكريبت، مطبعاً نتائج mock. + +## نصائح احترافية ومشكلات شائعة + +- **Memory spikes:** إذا كنت تعالج آلاف ملفات PNG عالية الدقة، فكر في تغيير حجمها قبل OCR. غالباً ما يقبل `aocr.Image.load` معامل `max_size`. +- **Unicode handling:** افتح دائمًا ملف الإخراج باستخدام `encoding="utf-8"`؛ يمكن لمحركات OCR إنتاج أحرف غير ASCII. +- **Parallelism:** بالنسبة لـ OCR المعتمد على CPU يمكنك تغليف `ocr_batch` في `concurrent.futures.ThreadPoolExecutor`. فقط تذكر الحفاظ على نسخة واحدة من `ai` – إنشاء العديد من الخيوط التي تستدعي كل منها `ai.initialize` يتعارض مع هدف “تحرير موارد AI”. 
+- **Error resilience:** غلف حلقة الصور الفردية داخل كتلة `try/except` حتى لا يتسبب PNG تالف واحد في إيقاف الدفعة بأكملها. + +## الخلاصة + +أصبحت الآن تمتلك **python ocr tutorial** يوضح كيفية **load png image** الملفات، تنفيذ **batch OCR processing**، وإدارة **free AI resources** بمسؤولية. المثال الكامل القابل للتنفيذ يوضح بالضبط كيفية **recognize text from image** الكائنات وتنظيف الموارد بعد ذلك، بحيث يمكنك نسخه ولصقه في مشاريعك دون البحث عن أجزاء مفقودة. + +هل أنت مستعد للخطوة التالية؟ جرّب استبدال وحدات `aocr` و `ai` الوهمية بمكتبات حقيقية مثل `pytesseract` و `torchvision`. يمكنك أيضًا توسيع السكريبت لإنتاج JSON، أو إرسال النتائج إلى قاعدة بيانات، أو دمجه مع دلو تخزين سحابي. السماء هي الحد—برمجة سعيدة! + +{{< /blocks/products/pf/tutorial-page-section >}} +{{< /blocks/products/pf/main-container >}} +{{< /blocks/products/pf/main-wrap-class >}} +{{< blocks/products/products-backtop-button >}} \ No newline at end of file diff --git a/ocr/arabic/python/general/run-ocr-on-image-complete-guide-to-structured-text-extractio/_index.md b/ocr/arabic/python/general/run-ocr-on-image-complete-guide-to-structured-text-extractio/_index.md new file mode 100644 index 000000000..63d3cf0d8 --- /dev/null +++ b/ocr/arabic/python/general/run-ocr-on-image-complete-guide-to-structured-text-extractio/_index.md @@ -0,0 +1,254 @@ +--- +category: general +date: 2026-05-03 +description: تعلم كيفية تشغيل OCR على الصورة واستخراج النص مع الإحداثيات باستخدام + التعرف المهيكل على OCR. يتضمن كود بايثون خطوة بخطوة. +draft: false +keywords: +- run OCR on image +- extract text with coordinates +- structured OCR recognition +- OCR post‑processing +- bounding box extraction +- image text detection +language: ar +og_description: قم بتشغيل OCR على الصورة واحصل على النص مع الإحداثيات باستخدام التعرف + على OCR الهيكلي. مثال كامل بلغة بايثون مع الشروحات. 
+og_title: تشغيل OCR على الصورة – دليل استخراج النص المهيكل +tags: +- OCR +- Python +- Computer Vision +title: تشغيل التعرف الضوئي على الأحرف في الصورة – دليل شامل لاستخراج النص المهيكل +url: /ar/python/general/run-ocr-on-image-complete-guide-to-structured-text-extractio/ +--- + +{{< blocks/products/pf/main-wrap-class >}} +{{< blocks/products/pf/main-container >}} +{{< blocks/products/pf/tutorial-page-section >}} + +# تشغيل OCR على الصورة – دليل كامل لاستخراج النص المهيكل + +هل احتجت يومًا إلى **تشغيل OCR على ملفات الصورة** لكن لم تكن متأكدًا من كيفية الحفاظ على المواقع الدقيقة لكل كلمة؟ لست وحدك. في العديد من المشاريع—مسح الفواتير، رقمنة النماذج، أو اختبار واجهات المستخدم—تحتاج ليس فقط إلى النص الخام بل أيضًا إلى الصناديق المحيطة التي تخبرك بمكان كل سطر على الصورة. + +يُظهر لك هذا الدرس طريقة عملية لـ *تشغيل OCR على الصورة* باستخدام محرك **aocr**، طلب **التعرف على OCR المهيكل**، ثم معالجة النتيجة مع الحفاظ على الهندسة. بنهاية الدرس ستتمكن من **استخراج النص مع الإحداثيات** في بضع أسطر من بايثون، وستفهم لماذا وضع المهيكلة مهم للمهام اللاحقة. + +## ما ستتعلمه + +- كيفية تهيئة محرك OCR لـ **التعرف على OCR المهيكل**. +- كيفية إمداد الصورة واستلام النتائج الخام التي تشمل حدود السطر. +- كيفية تشغيل معالج لاحق ينظف النص دون فقدان الهندسة. +- كيفية التكرار على السطور النهائية وطباعة كل قطعة نص مع صندوقها المحيط. + +لا سحر، لا خطوات مخفية—فقط مثال كامل قابل للتنفيذ يمكنك إدراجه في مشروعك الخاص. + +--- + +## المتطلبات المسبقة + +قبل أن نبدأ، تأكد من تثبيت ما يلي: + +```bash +pip install aocr ai # hypothetical packages; replace with real ones if needed +``` + +ستحتاج أيضًا إلى ملف صورة (`input_image.png` أو `.jpg`) يحتوي على نص واضح ومقروء. أي شيء من فاتورة ممسوحة ضوئيًا إلى لقطة شاشة يعمل، طالما أن محرك OCR يستطيع رؤية الأحرف. + +--- + +## الخطوة 1: تهيئة محرك OCR للتعرف المهيكل + +أول شيء نقوم به هو إنشاء نسخة من `aocr.Engine()` وإخبارها أننا نريد **التعرف على OCR المهيكل**. 
وضع المهيكلة يُعيد ليس فقط النص العادي بل أيضًا البيانات الهندسية (مستطيلات محيطة) لكل سطر، وهو أمر أساسي عندما تحتاج إلى ربط النص بالصورة. + +```python +import aocr +import ai # hypothetical post‑processing module + +# Initialise the OCR engine +ocr_engine = aocr.Engine() + +# Request structured recognition (text + geometry) +ocr_engine.recognize_mode = aocr.RecognitionMode.Structured +``` + +> **لماذا هذا مهم:** +> في الوضع الافتراضي قد يعطيك المحرك سلسلة من الكلمات المتصلة فقط. وضع المهيكلة يمنحك هيكلية من صفحات → أسطر → كلمات، كل منها مع إحداثيات، مما يجعل من السهل جدًا وضع النتائج فوق الصورة الأصلية أو تمريرها إلى نموذج واعٍ للتخطيط. + +--- + +## الخطوة 2: تشغيل OCR على الصورة والحصول على النتائج الخام + +الآن نمرر الصورة إلى المحرك. استدعاء `recognize` يُعيد كائن `OcrResult` يحتوي على مجموعة من الأسطر، كل سطر له مستطيله المحيط الخاص. + +```python +# Load your image (any format supported by aocr) +input_image_path = "input_image.png" + +# Run OCR – this returns an OcrResult with lines and bounds +raw_result = ocr_engine.recognize(input_image_path) +``` + +في هذه المرحلة `raw_result.lines` يحتوي على كائنات ذات خاصيتين مهمتين: + +- `text` – السلسلة المعترف بها لهذا السطر. +- `bounds` – مجموعة قيم مثل `(x, y, width, height)` تصف موقع السطر. + +--- + +## الخطوة 3: معالجة لاحقة مع الحفاظ على الهندسة + +مخرجات OCR الخام غالبًا ما تكون صاخبة: أحرف عشوائية، مسافات غير صحيحة، أو مشاكل في فواصل الأسطر. الدالة `ai.run_postprocessor` تنظف النص ولكن **تحافظ على الهندسة الأصلية**، لذا لا يزال لديك إحداثيات دقيقة. + +```python +# Apply a post‑processing step that corrects common OCR errors +postprocessed_result = ai.run_postprocessor(raw_result) + +# The structure (lines + bounds) stays the same, only `line.text` changes +``` + +> **نصيحة احترافية:** إذا كان لديك مفردات خاصة بمجال معين (مثل رموز المنتجات)، قدم قاموسًا مخصصًا للمعالج اللاحق لتحسين الدقة. 
+ +--- + +## الخطوة 4: استخراج النص مع الإحداثيات – التكرار والعرض + +أخيرًا، نمر على الأسطر المنقحة، ونطبع صندوق كل سطر إلى جانب نصه. هذا هو جوهر **استخراج النص مع الإحداثيات**. + +```python +# Print each recognised line together with its bounding box +for line in postprocessed_result.lines: + print(f"[{line.bounds}] {line.text}") +``` + +### النتيجة المتوقعة + +بافتراض أن الصورة المدخلة تحتوي على سطرين: “Invoice #12345” و “Total: $89.99”، ستحصل على شيء مشابه لـ: + +``` +[(15, 30, 210, 25)] Invoice #12345 +[(15, 70, 190, 25)] Total: $89.99 +``` + +القيمة الأولى هي `(x, y, width, height)` للسطر على الصورة الأصلية، مما يتيح لك رسم المستطيلات، تمييز النص، أو تمرير الإحداثيات إلى نظام آخر. + +--- + +## تصور النتيجة (اختياري) + +إذا أردت رؤية الصناديق المحيطة مضافة إلى الصورة، يمكنك استخدام Pillow (PIL) لرسم المستطيلات. أدناه مقتطف سريع؛ يمكنك تخطيه إذا كنت تحتاج فقط إلى البيانات الخام. + +```python +from PIL import Image, ImageDraw + +# Open the original image +img = Image.open(input_image_path) +draw = ImageDraw.Draw(img) + +# Draw a rectangle around each line +for line in postprocessed_result.lines: + x, y, w, h = line.bounds + draw.rectangle([x, y, x + w, y + h], outline="red", width=2) + +# Save or show the annotated image +img.save("annotated_output.png") +img.show() +``` + +![run OCR on image example showing bounding boxes](/images/ocr-bounding-boxes.png "run OCR on image – bounding box overlay") + +النص البديل أعلاه يحتوي على **الكلمة المفتاحية الأساسية**، لتلبية متطلبات تحسين محركات البحث لسمات alt في الصور. + +--- + +## لماذا التعرف على OCR المهيكل يتفوق على استخراج النص البسيط + +قد تتساءل، “أليس بإمكاني فقط تشغيل OCR والحصول على النص؟ لماذا أحتاج إلى الهندسة؟” + +- **السياق المكاني:** عندما تحتاج إلى ربط الحقول في نموذج (مثلاً “التاريخ” بجوار قيمة التاريخ)، تُظهر لك الإحداثيات *أين* يقع البيانات. +- **تخطيطات متعددة الأعمدة:** النص الخطي البسيط يفقد الترتيب؛ البيانات المهيكلة تحافظ على ترتيب الأعمدة. 
+- **دقة المعالجة اللاحقة:** معرفة حجم الصندوق يساعدك على تحديد ما إذا كانت الكلمة عنوانًا، حاشية، أو عنصرًا عشوائيًا. + +باختصار، **التعرف على OCR المهيكل** يمنحك المرونة لبناء خطوط أنابيب أذكى—سواء كنت تُدخل البيانات إلى قاعدة بيانات، تُنشئ ملفات PDF قابلة للبحث، أو تدرب نموذج تعلم آلي يحترم التخطيط. + +--- + +## الحالات الطرفية الشائعة وكيفية التعامل معها + +| الحالة | ما يجب مراقبته | الحل المقترح | +|-----------|-------------------|---------------| +| **صور مائلة أو مائلة** | قد تكون الصناديق المحيطة خارج المحور. | عالج مسبقًا باستخدام تصحيح الميل (مثل `warpAffine` في OpenCV). | +| **خطوط صغيرة جدًا** | قد يفوت المحرك الأحرف، مما يؤدي إلى أسطر فارغة. | زيادة دقة الصورة أو استخدام `ocr_engine.set_dpi(300)`. | +| **لغات مختلطة** | نموذج اللغة الخاطئ قد ينتج نصًا مشوشًا. | ضبط `ocr_engine.language = ["en", "de"]` قبل التعرف. | +| **صناديق متداخلة** | قد يدمج المعالج اللاحق سطرين عن غير قصد. | تحقق من `line.bounds` بعد المعالجة؛ اضبط العتبات في `ai.run_postprocessor`. | + +معالجة هذه السيناريوهات مبكرًا توفر عليك عناءً كبيرًا لاحقًا، خاصةً عندما توسع الحل إلى مئات المستندات يوميًا. + +--- + +## البرنامج الكامل من البداية إلى النهاية + +فيما يلي البرنامج الكامل الجاهز للتنفيذ الذي يجمع جميع الخطوات معًا. انسخه، عدل مسار الصورة، وستكون جاهزًا. + +```python +# -*- coding: utf-8 -*- +""" +Run OCR on image – extract text with coordinates using structured OCR recognition. 
+Author: Your Name +Date: 2026-05-03 +""" + +import aocr +import ai +from PIL import Image, ImageDraw + +def run_structured_ocr(image_path: str, annotate: bool = False): + # 1️⃣ Initialise the OCR engine + ocr_engine = aocr.Engine() + ocr_engine.recognize_mode = aocr.RecognitionMode.Structured + + # 2️⃣ Recognise the image + raw_result = ocr_engine.recognize(image_path) + + # 3️⃣ Post‑process while keeping geometry + processed = ai.run_postprocessor(raw_result) + + # 4️⃣ Print each line with its bounding box + for line in processed.lines: + print(f"[{line.bounds}] {line.text}") + + # Optional visualisation + if annotate: + img = Image.open(image_path) + draw = ImageDraw.Draw(img) + for line in processed.lines: + x, y, w, h = line.bounds + draw.rectangle([x, y, x + w, y + h], outline="red", width=2) + annotated_path = "annotated_" + image_path + img.save(annotated_path) + print(f"Annotated image saved as {annotated_path}") + +if __name__ == "__main__": + INPUT_IMG = "input_image.png" + run_structured_ocr(INPUT_IMG, annotate=True) +``` + +تشغيل هذا البرنامج سيقوم بـ: + +1. **تشغيل OCR على الصورة** بوضع مهيكل. +2. **استخراج النص مع الإحداثيات** لكل سطر. +3. إنتاج PNG مشروح اختياريًا يُظهر الصناديق. + +--- + +## الخلاصة + +أصبح لديك الآن حل متكامل ومستقل لـ **تشغيل OCR على الصورة** و**استخراج النص مع الإحداثيات** باستخدام **التعرف على OCR المهيكل**. يوضح الكود كل خطوة—من تهيئة المحرك إلى المعالجة اللاحقة والتحقق البصري—حتى تتمكن من تكييفه مع الفواتير، النماذج، أو أي مستند بصري يحتاج إلى تحديد موقع النص بدقة. + +ما الخطوة التالية؟ جرّب استبدال محرك `aocr` بمكتبة أخرى (Tesseract، EasyOCR) وانظر كيف تختلف مخرجاتهما المهيكلة. جرب استراتيجيات معالجة لاحقة مختلفة، مثل التدقيق الإملائي أو مرشحات regex مخصصة، لتعزيز الدقة في مجالك. وإذا كنت تبني خط أنابيب أكبر، فكر في تخزين أزواج `(text, bounds)` في قاعدة بيانات للتحليلات المستقبلية. + +برمجة سعيدة، ولتكن مشاريع OCR دائمًا دقيقة! 
+ +{{< /blocks/products/pf/tutorial-page-section >}} +{{< /blocks/products/pf/main-container >}} +{{< /blocks/products/pf/main-wrap-class >}} +{{< blocks/products/products-backtop-button >}} \ No newline at end of file diff --git a/ocr/chinese/python/general/extract-text-from-image-ocr-with-aspose-ai-spell-check/_index.md b/ocr/chinese/python/general/extract-text-from-image-ocr-with-aspose-ai-spell-check/_index.md new file mode 100644 index 000000000..6c945bbdf --- /dev/null +++ b/ocr/chinese/python/general/extract-text-from-image-ocr-with-aspose-ai-spell-check/_index.md @@ -0,0 +1,228 @@ +--- +category: general +date: 2026-05-03 +description: 使用 Aspose OCR 和 AI 拼写检查从图像中提取文本。学习如何对图像进行 OCR、加载图像进行 OCR、识别发票中的文本并释放 + GPU 资源。 +draft: false +keywords: +- extract text from image +- how to ocr image +- load image for ocr +- release gpu resources +- recognize text from invoice +language: zh +og_description: 使用 Aspose OCR 和 AI 拼写检查从图像中提取文本。一步步指南,涵盖如何对图像进行 OCR、加载图像进行 OCR,以及释放 + GPU 资源。 +og_title: 从图像提取文本 – 完整的 OCR 与拼写检查指南 +tags: +- OCR +- Aspose +- AI +- Python +title: 从图像中提取文本 – Aspose AI 拼写检查的 OCR +url: /zh/python/general/extract-text-from-image-ocr-with-aspose-ai-spell-check/ +--- + +{{< blocks/products/pf/main-wrap-class >}} +{{< blocks/products/pf/main-container >}} +{{< blocks/products/pf/tutorial-page-section >}} + +# 从图像提取文本 – 完整 OCR 与拼写检查指南 + +是否曾需要 **从图像提取文本**,却不确定哪个库既快又准?你并不孤单。在许多真实项目中——比如发票处理、收据数字化或合同扫描——从图片中获取干净、可搜索的文本是第一道难关。 + +好消息是,Aspose OCR 搭配轻量级 Aspose AI 模型,只需几行 Python 代码即可完成这项工作。在本教程中,我们将演示 **如何 OCR 图像**、正确加载图片、运行内置的拼写检查后处理器,最后 **释放 GPU 资源**,让你的应用保持内存友好。 + +阅读完本指南后,你将能够 **识别发票图像中的文本**,自动纠正常见的 OCR 错误,并为下一个批次保持 GPU 清洁。 + +--- + +## 你需要准备的环境 + +- Python 3.9 或更高(代码使用类型提示,但在更早的 3.x 版本也可运行) +- `aspose-ocr` 与 `aspose-ai` 包(通过 `pip install aspose-ocr aspose-ai` 安装) +- 可选的 CUDA‑enabled GPU;如果未检测到 GPU,脚本会回退到 CPU +- 示例图片,例如 `sample_invoice.png`,放在可引用的文件夹中 + +无需大型机器学习框架,也不需要庞大的模型下载——只需一个小巧的 Q4‑K‑M 量化模型,能够轻松适配大多数 GPU。 + +--- + +## 第一步:初始化 OCR 引擎 – extract text from 
image + +首先创建一个 `OcrEngine` 实例,并指定期望的语言。这里我们选择 English 并请求纯文本输出,这对后续处理最为理想。 + +```python +import aocr # Aspose OCR package +import aspose.ai as ai # Aspose AI package + +# Initialise the OCR engine +ocr_engine = aocr.OcrEngine() +ocr_engine.language = aocr.Language.English # Choose any supported language +ocr_engine.recognize_mode = aocr.RecognitionMode.Plain # Plain text makes post‑processing easier +``` + +**为什么重要:** 设置语言可以缩小字符集范围,提高识别准确度。纯文本模式会去除布局信息,正好适用于只想 **从图像提取文本** 的场景。 + +--- + +## 第二步:加载图像进行 OCR – how to OCR image + +接下来将实际图片喂给引擎。`Image.load` 辅助函数支持常见格式(PNG、JPEG、TIFF),并抽象了文件 I/O 的细节。 + +```python +# Load the input image – this is the "load image for OCR" step +input_image = aocr.Image.load("YOUR_DIRECTORY/sample_invoice.png") +raw_text = ocr_engine.recognize(input_image) # Returns the recognised text as a string +``` + +**提示:** 如果源图片尺寸较大,建议在送入引擎前先进行缩放;较小的分辨率可以降低 GPU 内存占用,而不会显著影响识别质量。 + +--- + +## 第三步:配置 Aspose AI 模型 – recognize text from invoice + +Aspose AI 附带一个小型 GGUF 模型,可自动下载。示例使用 `Qwen2.5‑3B‑Instruct‑GGUF` 仓库,量化为 `q4_k_m`。我们还指示运行时在 GPU 上分配 20 层,以在速度和显存之间取得平衡。 + +```python +# Model configuration – auto‑download a small Q4‑K‑M quantised model +model_config = ai.AsposeAIModelConfig() +model_config.allow_auto_download = "true" +model_config.hugging_face_repo_id = "bartowski/Qwen2.5-3B-Instruct-GGUF" +model_config.hugging_face_quantization = "q4_k_m" +model_config.gpu_layers = 20 # Use 20 GPU layers when a GPU is available +``` + +**内部原理:** 量化模型在磁盘上约为 1.5 GB,仅为全精度模型的一小部分,但仍能捕捉足够的语言细微差别,以标记常见的 OCR 拼写错误。 + +--- + +## 第四步:初始化 AsposeAI 并挂载拼写检查后处理器 + +Aspose AI 包含现成的拼写检查后处理器。将其挂载后,所有 OCR 结果都会自动清理。 + +```python +# Initialise AsposeAI and attach the built‑in spell‑check post‑processor +ocr_ai = ai.AsposeAI(model_config) # Pass the config we just built +ocr_ai.set_post_processor(ocr_ai.postprocessor_spell_check, {}) # Empty dict → default settings +``` + +**为什么使用后处理器?** OCR 引擎常把 “Invoice” 误读为 “Invo1ce”,或把 “Total” 误读为 “T0tal”。拼写检查会在原始字符串上运行轻量语言模型,自动纠正这些错误,无需自行编写字典。 
+ +--- + +## 第五步:在 OCR 结果上运行拼写检查后处理器 + +所有配置就绪后,只需一次调用即可得到校正后的文本。我们同时打印原始文本和清理后的文本,方便对比改进效果。 + +```python +# Run the spell‑check post‑processor on the OCR result +corrected_text = ocr_ai.run_postprocessor(raw_text) + +print("Original :", raw_text) +print("Corrected:", corrected_text) +``` + +发票的典型输出可能如下所示: + +``` +Original : Invo1ce #12345 +Date: 2023/07/15 +Total: $1,250.00 +... +Corrected: Invoice #12345 +Date: 2023/07/15 +Total: $1,250.00 +... +``` + +可以看到 “Invo1ce” 已被正确纠正为 “Invoice”。这正是内置 AI 拼写检查的威力所在。 + +--- + +## 第六步:安全释放 GPU 资源 – release gpu resources safely + +如果你在长时间运行的服务中使用(例如每分钟处理数十张发票的 Web API),必须在每个批次后释放 GPU 上下文。否则会出现内存泄漏,最终导致 “CUDA out of memory” 错误。 + +```python +# Release GPU resources – crucial to avoid memory leaks +ocr_ai.free_resources() +``` + +**专业技巧:** 将 `free_resources()` 放在 `finally` 块或上下文管理器中,以确保即使出现异常也能执行释放操作。 + +--- + +## 完整示例代码 + +将上述所有片段组合在一起,即可得到一个可直接放入任意项目的独立脚本。 + +```python +# extract_text_from_image.py +import aocr +import aspose.ai as ai + +def main(): + # Step 1: Initialise OCR engine + ocr_engine = aocr.OcrEngine() + ocr_engine.language = aocr.Language.English + ocr_engine.recognize_mode = aocr.RecognitionMode.Plain + + # Step 2: Load image for OCR + input_image = aocr.Image.load("YOUR_DIRECTORY/sample_invoice.png") + raw_text = ocr_engine.recognize(input_image) + + # Step 3: Configure Aspose AI model + model_cfg = ai.AsposeAIModelConfig() + model_cfg.allow_auto_download = "true" + model_cfg.hugging_face_repo_id = "bartowski/Qwen2.5-3B-Instruct-GGUF" + model_cfg.hugging_face_quantization = "q4_k_m" + model_cfg.gpu_layers = 20 + + # Step 4: Initialise AI and attach spell‑check + ocr_ai = ai.AsposeAI(model_cfg) + ocr_ai.set_post_processor(ocr_ai.postprocessor_spell_check, {}) + + # Step 5: Run spell‑check + corrected_text = ocr_ai.run_postprocessor(raw_text) + + print("Original :", raw_text) + print("Corrected:", corrected_text) + + # Step 6: Release GPU resources + ocr_ai.free_resources() + +if __name__ == "__main__": + main() 
+``` + +保存文件,修改图片路径,然后运行 `python extract_text_from_image.py`。你应该会在控制台看到已清理的发票文本。 + +--- + +## 常见问题解答 (FAQ) + +**Q: 这在仅有 CPU 的机器上能运行吗?** +A: 完全可以。如果未检测到 GPU,Aspose AI 会回退到 CPU 执行,虽然速度会慢一些。你也可以通过设置 `model_cfg.gpu_layers = 0` 强制使用 CPU。 + +**Q: 如果我的发票使用的不是英文怎么办?** +A: 将 `ocr_engine.language` 改为相应的枚举值(例如 `aocr.Language.Spanish`)。拼写检查模型支持多语言,但使用针对特定语言的模型可能会得到更好效果。 + +**Q: 能否在循环中处理多张图片?** +A: 能。只需将加载、识别和后处理步骤放入 `for` 循环中。若复用同一 AI 实例,记得在循环结束后或每个批次后调用 `ocr_ai.free_resources()`。 + +**Q: 模型下载大小是多少?** +A: 量化的 `q4_k_m` 版本约为 1.5 GB。首次运行后会被缓存,后续执行几乎是瞬时的。 + +--- + +## 结论 + +本教程展示了如何使用 Aspose OCR **从图像提取文本**、配置小型 AI 模型、应用拼写检查后处理器,并安全 **释放 GPU 资源**。整个工作流覆盖了从加载图片到清理资源的全部步骤,为 **recognize text from invoice** 场景提供了可靠的流水线。 + +下一步?尝试将拼写检查替换为自定义的实体抽取模型 + +{{< /blocks/products/pf/tutorial-page-section >}} +{{< /blocks/products/pf/main-container >}} +{{< /blocks/products/pf/main-wrap-class >}} +{{< blocks/products/products-backtop-button >}} \ No newline at end of file diff --git a/ocr/chinese/python/general/how-to-batch-ocr-with-aspose-ocr-full-python-guide/_index.md b/ocr/chinese/python/general/how-to-batch-ocr-with-aspose-ocr-full-python-guide/_index.md new file mode 100644 index 000000000..efadc6b82 --- /dev/null +++ b/ocr/chinese/python/general/how-to-batch-ocr-with-aspose-ocr-full-python-guide/_index.md @@ -0,0 +1,213 @@ +--- +category: general +date: 2026-05-03 +description: 如何使用 Aspose OCR 与 AI 拼写检查批量处理图像 OCR。学习从图像中提取文本、应用拼写检查、免费使用 AI 资源并纠正 OCR + 错误。 +draft: false +keywords: +- how to batch ocr +- extract text from images +- free ai resources +- apply spell check +- correct ocr errors +language: zh +og_description: 如何使用 Aspose OCR 和 AI 拼写检查批量处理图像 OCR。请按照分步指南从图像中提取文本、进行拼写检查、免费使用 AI + 资源并纠正 OCR 错误。 +og_title: 如何使用 Aspose OCR 进行批量 OCR – 完整的 Python 教程 +tags: +- OCR +- Python +- AI +- Aspose +title: 如何使用 Aspose OCR 批量 OCR – 完整 Python 指南 +url: /zh/python/general/how-to-batch-ocr-with-aspose-ocr-full-python-guide/ +--- + +{{< blocks/products/pf/main-wrap-class >}} +{{< 
blocks/products/pf/main-container >}} +{{< blocks/products/pf/tutorial-page-section >}} + +# 如何使用 Aspose OCR 批量 OCR – 完整 Python 指南 + +是否曾想过 **如何批量 OCR** 整个文件夹的扫描 PDF 或照片,而无需为每个文件编写单独的脚本?你并不孤单。在许多真实世界的流水线中,你需要 **从图像中提取文本**,清理拼写错误,最后释放你分配的任何 AI 资源。本教程将向你展示如何使用 Aspose OCR、轻量级 AI 后处理器以及几行 Python 来实现这一点。 + +我们将逐步演示如何初始化 OCR 引擎、连接 AI 拼写检查器、遍历图片目录以及随后清理模型。完成后,你将拥有一个可直接运行的脚本,能够自动 **纠正 OCR 错误** 并释放 **免费 AI 资源**,让你的 GPU 保持愉快。 + +## 你需要的环境 + +- Python 3.9+(代码使用类型提示,但在更早的 3.x 版本也能运行) +- `asposeocr` 包(`pip install asposeocr`)– 提供 OCR 引擎。 +- 访问 Hugging Face 模型 `bartowski/Qwen2.5-3B-Instruct-GGUF`(会自动下载)。 +- 至少拥有几 GB VRAM 的 GPU(脚本将 `gpu_layers = 30`,如有需要可降低)。 + +无需外部服务,无需付费 API——所有操作均在本地完成。 + +--- + +## 步骤 1:设置 OCR 引擎 – **如何批量 OCR** 高效实现 + +在处理成千上万张图像之前,我们需要一个可靠的 OCR 引擎。Aspose OCR 让我们能够在一次调用中选择语言和识别模式。 + +```python +# Step 1: Initialize the OCR engine for English plain‑text output +def init_ocr() -> aocr.OcrEngine: + ocr_engine = aocr.OcrEngine() + ocr_engine.language = aocr.Language.English # English language pack + ocr_engine.recognize_mode = aocr.RecognitionMode.Plain # Returns raw string, no layout + return ocr_engine +``` + +**为什么这很重要:** 将 `recognize_mode` 设置为 `Plain` 可保持输出轻量化,这在后续进行拼写检查时非常理想。如果需要布局信息,你可以切换为 `Layout`,但这会增加开销,批处理作业通常不需要。 + +> **专业提示:** 如果处理多语言扫描,你可以传入类似 `ocr_engine.language = [aocr.Language.English, aocr.Language.Spanish]` 的列表。 + +--- + +## 步骤 2:初始化 AI 后处理器 – **对 OCR 输出应用拼写检查** + +Aspose AI 附带一个内置的后处理器,可以运行任意模型。这里我们从 Hugging Face 拉取一个量化的 Qwen 2.5 模型,并挂接拼写检查例程。 + +```python +# Step 2: Configure and start the AI post‑processor +def init_ai() -> aocr.ai.AsposeAI: + model_cfg = AsposeAIModelConfig() + model_cfg.allow_auto_download = "true" + model_cfg.hugging_face_repo_id = "bartowski/Qwen2.5-3B-Instruct-GGUF" + model_cfg.hugging_face_quantization = "q4_k_m" + model_cfg.gpu_layers = 30 # Adjust based on your GPU memory + ai_processor = AsposeAI() + ai_processor.initialize(model_cfg) + + # Attach the built‑in spell‑check post‑processor + 
ai_processor.set_post_processor(ai_processor.postprocessor_spell_check, {}) + return ai_processor +``` + +**为什么这很重要:** 该模型已量化(`q4_k_m`),大幅降低内存占用,同时仍提供相当的语言理解能力。通过调用 `set_post_processor`,我们告诉 Aspose AI 自动对任何输入字符串执行 **apply spell check** 步骤。 + +> **注意:** 如果你的 GPU 无法处理 30 层,请将数字降至 15 或甚至 5——脚本仍能工作,只是会稍慢一些。 + +--- + +## 步骤 3:对单张图像运行 OCR 并 **纠正 OCR 错误** + +现在 OCR 引擎和 AI 拼写检查器都已准备就绪,我们将它们结合。此函数加载图像,提取原始文本,然后运行 AI 后处理器进行清理。 + +```python +# Step 3: OCR an image and run the spell‑check post‑processor +def ocr_and_correct(image_path: str, + ocr_engine: aocr.OcrEngine, + ai_processor: aocr.ai.AsposeAI) -> str: + image = aocr.Image.load(image_path) # Load any supported format + raw_text = ocr_engine.recognize(image) # Plain string from OCR + corrected_text = ai_processor.run_postprocessor(raw_text) + return corrected_text +``` + +**为什么这很重要:** 直接将原始 OCR 字符串输入 AI 模型即可实现 **correct OCR errors** 处理,无需编写正则或自定义词典。模型了解上下文,能够将 “recieve” 修正为 “receive”,甚至处理更微妙的错误。 + +--- + +## 步骤 4:批量 **从图像中提取文本** – 真正的批处理循环 + +这里正是 **如何批量 OCR** 发挥魔力的地方。我们遍历目录,跳过不支持的文件,并将每个校正后的输出写入 `.txt` 文件。 + +```python +# Step 4: Process an entire folder of images +if __name__ == "__main__": + # Initialize once – reuse for every file + ocr_engine = init_ocr() + ai_processor = init_ai() + + input_dir = "YOUR_DIRECTORY/input_images" + output_dir = "YOUR_DIRECTORY/output_text" + os.makedirs(output_dir, exist_ok=True) + + for file_name in os.listdir(input_dir): + # Only handle common image extensions + if not file_name.lower().endswith(('.png', '.jpg', '.jpeg', '.tif', '.tiff')): + continue + + image_path = os.path.join(input_dir, file_name) + corrected = ocr_and_correct(image_path, ocr_engine, ai_processor) + + txt_path = os.path.join(output_dir, + os.path.splitext(file_name)[0] + ".txt") + with open(txt_path, "w", encoding="utf-8") as txt_file: + txt_file.write(corrected) + + print(f"Processed {file_name}") + + # Step 5: Release **free AI resources** after the batch finishes + ai_processor.free_resources() +``` + +### 
预期输出 + +对于包含句子 *“The quick brown fox jumps over the lazzy dog.”* 的图像,你将在文本文件中看到: + +``` +The quick brown fox jumps over the lazy dog. +``` + +请注意,双 “z” 已被自动纠正——这正是 AI 拼写检查的效果。 + +**为什么这很重要:** 只创建一次 OCR 和 AI 对象并重复使用,可避免为每个文件加载模型的开销。这是大规模 **如何批量 OCR** 最高效的方式。 + +--- + +## 步骤 5:清理 – 正确 **释放 AI 资源** + +完成后,调用 `free_resources()` 可释放 GPU 内存、CUDA 上下文以及模型创建的任何临时文件。 + +```python +# Step 5: Explicitly free GPU and model memory +ai_processor.free_resources() +``` + +跳过此步骤可能导致 GPU 资源悬挂,进而导致后续 Python 进程崩溃或耗尽显存。把它想象成批处理作业的“关灯”环节。 + +--- + +## 常见陷阱与额外提示 + +| Issue | What to Look For | Fix | +|-------|------------------|-----| +| **内存不足错误** | GPU 在处理几十张图像后耗尽 | 降低 `gpu_layers` 或切换到 CPU(`model_cfg.gpu_layers = 0`)。 | +| **缺少语言包** | OCR 返回空字符串 | 确保 `asposeocr` 版本包含英文语言数据;如有需要请重新安装。 | +| **非图像文件** | 脚本在偶然出现的 `.pdf` 文件上崩溃 | `if not file_name.lower().endswith(...)` 检查已跳过这些文件。 | +| **未应用拼写检查** | 输出与原始 OCR 完全相同 | 确认在循环前已调用 `ai_processor.set_post_processor`。 | +| **批处理速度慢** | 每张图像耗时 >5 秒 | 首次运行后设置 `model_cfg.allow_auto_download = "false"`,防止每次都重新下载模型。 | + +**专业提示:** 如果需要在非英文语言中 **从图像中提取文本**,只需将 `ocr_engine.language` 更改为相应的枚举(例如 `aocr.Language.French`)。相同的 AI 后处理器仍会执行拼写检查,但为了获得最佳效果,你可能需要使用特定语言的模型。 + +--- + +## 回顾与后续步骤 + +我们已经完整覆盖了 **如何批量 OCR** 的整个流程: + +1. **初始化** 一个用于英文的纯文本 OCR 引擎。 +2. **配置** AI 拼写检查模型并将其绑定为后处理器。 +3. **运行** OCR 于每张图像,并让 AI 自动 **纠正 OCR 错误**。 +4. **遍历** 目录以批量 **从图像中提取文本**。 +5. 作业完成后 **释放 AI 资源**。 + +从这里你可以: + +- 将校正后的文本传入下游 NLP 流水线(情感分析、实体抽取等)。 +- 通过调用 `ai_processor.set_post_processor(your_custom_func, {})` 将拼写检查后处理器替换为自定义摘要器。 +- 如果 GPU 能够处理多路流,可使用 `concurrent.futures.ThreadPoolExecutor` 并行化文件夹循环。 + +--- + +## 最后思考 + +批量 OCR 并非繁琐的任务。结合 Aspose OCR 与轻量级 AI 模型,你即可获得一个 **一站式解决方案**,能够 **从图像中提取文本**、**应用拼写检查**、**纠正 OCR 错误**,并且 **干净地释放 AI 资源**。在测试文件夹上运行脚本,调整 GPU 层数以匹配你的硬件,你将在几分钟内拥有可投入生产的流水线。 + +对模型调优、处理 PDF 或将其集成到 Web 服务有疑问?在下方留言或在 GitHub 上联系我。祝编码愉快,愿你的 OCR 永远精准! 
+ +{{< /blocks/products/pf/tutorial-page-section >}} +{{< /blocks/products/pf/main-container >}} +{{< /blocks/products/pf/main-wrap-class >}} +{{< blocks/products/products-backtop-button >}} \ No newline at end of file diff --git a/ocr/chinese/python/general/python-ocr-tutorial-batch-ocr-processing-made-easy/_index.md b/ocr/chinese/python/general/python-ocr-tutorial-batch-ocr-processing-made-easy/_index.md new file mode 100644 index 000000000..206dafa72 --- /dev/null +++ b/ocr/chinese/python/general/python-ocr-tutorial-batch-ocr-processing-made-easy/_index.md @@ -0,0 +1,296 @@ +--- +category: general +date: 2026-05-03 +description: Python OCR 教程,展示如何加载 PNG 图像文件、识别图像中的文本以及用于批量 OCR 处理的免费 AI 资源。 +draft: false +keywords: +- python ocr tutorial +- batch ocr processing +- free ai resources +- load png image +- recognize text from image +language: zh +og_description: Python OCR 教程将引导您加载 PNG 图像、识别图像中的文本,并处理免费 AI 资源进行批量 OCR 处理。 +og_title: Python OCR 教程——使用免费 AI 资源快速批量 OCR +tags: +- OCR +- Python +- AI +title: Python OCR 教程——轻松实现批量 OCR 处理 +url: /zh/python/general/python-ocr-tutorial-batch-ocr-processing-made-easy/ +--- + +{{< blocks/products/pf/main-wrap-class >}} +{{< blocks/products/pf/main-container >}} +{{< blocks/products/pf/tutorial-page-section >}} + +# Python OCR 教程 – 批量 OCR 处理轻松实现 + +是否曾经需要一个 **python ocr tutorial**,能够让你在 dozens of PNG files 上运行 OCR 而不至于抓狂?你并不孤单。在许多真实项目中,你必须 **load png image** 文件,喂给 OCR 引擎,然后在完成后清理 AI 资源。 + +在本指南中,我们将逐步演示一个完整、可直接运行的示例,展示如何 **recognize text from image** 文件、批量处理它们,并释放底层 AI 内存。阅读完毕后,你将拥有一个可直接放入任意项目的独立脚本——没有多余的废话,只有必需的核心内容。 + +## 你需要准备的东西 + +- Python 3.10 或更高版本(本示例使用了 f‑strings 和类型提示) +- 一个提供 `engine.recognize` 方法的 OCR 库——演示中我们假设有一个虚构的 `aocr` 包,你也可以替换为 Tesseract、EasyOCR 等。 +- 代码片段中展示的 `ai` 辅助模块(负责模型初始化和资源清理) +- 一个装有待处理 PNG 文件的文件夹 + +如果你没有安装 `aocr` 或 `ai`,可以使用存根(stubs)来模拟——请参见文末的 “Optional Stubs” 部分。 + +## 第一步:初始化 AI 引擎(Free AI Resources) + +在将任何图像送入 OCR 流程之前,底层模型必须已经就绪。只初始化一次可以节省内存并加速批处理任务。 + +```python +# step_1_initialize.py +import 
ai # hypothetical helper that wraps the AI model +import aocr # OCR library + +def init_engine(config_path: str = "config.yaml"): + """ + Initialize the AI engine if it hasn't been set up yet. + This uses free AI resources – the engine will be released later. + """ + if not ai.is_initialized(): + ai.initialize(config_path) # auto‑initialize with the provided configuration + else: + print("Engine already initialized.") +``` + +**为什么这很重要:** +如果对每张图像都调用 `ai.initialize`,会一次又一次地分配 GPU 内存,最终导致脚本崩溃。通过检查 `ai.is_initialized()`,我们确保只分配一次——这正是 “free AI resources” 原则的核心。 + +## 第二步:加载 PNG 图像文件以进行批量 OCR 处理 + +现在我们收集所有想要进行 OCR 的 PNG 文件。使用 `pathlib` 可以让代码保持跨平台(OS‑agnostic)。 + +```python +# step_2_load_images.py +from pathlib import Path +from typing import List + +def collect_png_paths(directory: str) -> List[Path]: + """ + Scan `directory` and return a list of Path objects pointing to PNG files. + """ + base_path = Path(directory) + if not base_path.is_dir(): + raise NotADirectoryError(f"'{directory}' is not a valid folder.") + + png_files = sorted(base_path.glob("*.png")) + if not png_files: + raise FileNotFoundError("No PNG images found in the specified directory.") + + print(f"Found {len(png_files)} PNG image(s) to process.") + return png_files +``` + +**边缘情况:** +如果文件夹中包含非 PNG 文件(例如 JPEG),这些文件会被忽略,从而防止 `engine.recognize` 在不支持的格式上出错。 + +## 第三步:对每张图像运行 OCR 并进行后处理 + +在引擎准备就绪且文件列表已准备好后,我们可以遍历图像,提取原始文本,并交给后处理器清理常见的 OCR 产物(如多余的换行符)。 + +```python +# step_3_ocr_batch.py +import aocr +import ai +from pathlib import Path +from typing import List + +def ocr_batch(image_paths: List[Path]) -> List[str]: + """ + Perform OCR on each PNG image and return a list of cleaned strings. 
+ """ + results = [] + for image_path in image_paths: + # Load the image – aocr.Image.load abstracts away Pillow/OpenCV details + img = aocr.Image.load(str(image_path)) + + # Recognize raw text + raw_text = engine.recognize(img) + + # Refine the raw OCR output using the AI post‑processor + cleaned_text = ai.run_postprocessor(raw_text) + results.append(cleaned_text) + + print(f"Processed {image_path.name}: {len(cleaned_text)} characters extracted.") + + return results +``` + +**为何将加载与识别分离:** +`aocr.Image.load` 可能采用惰性解码,对于大批量处理更快。将加载步骤显式化也便于以后如果需要处理 JPEG 或 TIFF 时,直接替换为其他图像库。 + +## 第四步:清理 – 批处理完成后释放 AI 资源 + +批处理结束后,我们必须释放模型,以避免内存泄漏,尤其是在启用 GPU 的机器上。 + +```python +# step_4_cleanup.py +import ai + +def release_resources(): + """ + Free any allocated AI resources. Safe to call multiple times. + """ + if ai.is_initialized(): + ai.free_resources() + print("AI resources have been released.") + else: + print("No AI resources were allocated.") +``` + +## 综合示例 – 完整脚本 + +下面是一份将上述四个步骤串联起来的单文件脚本。将其保存为 `batch_ocr.py` 并在命令行运行。 + +```python +# batch_ocr.py +""" +Python OCR tutorial – end‑to‑end batch OCR processing. +Loads PNG images, runs OCR, post‑processes results, and frees AI resources. 
+""" + +import sys +from pathlib import Path +import ai +import aocr + +# ---------------------------------------------------------------------- +# Helper functions (copied from the steps above) +# ---------------------------------------------------------------------- +def init_engine(cfg: str = "config.yaml"): + if not ai.is_initialized(): + ai.initialize(cfg) + else: + print("Engine already initialized.") + +def collect_png_paths(directory: str): + base_path = Path(directory) + if not base_path.is_dir(): + raise NotADirectoryError(f"'{directory}' is not a valid folder.") + png_files = sorted(base_path.glob("*.png")) + if not png_files: + raise FileNotFoundError("No PNG images found in the specified directory.") + print(f"Found {len(png_files)} PNG image(s) to process.") + return png_files + +def ocr_batch(image_paths): + results = [] + for image_path in image_paths: + img = aocr.Image.load(str(image_path)) + raw_text = engine.recognize(img) + cleaned_text = ai.run_postprocessor(raw_text) + results.append(cleaned_text) + print(f"Processed {image_path.name}: {len(cleaned_text)} characters.") + return results + +def release_resources(): + if ai.is_initialized(): + ai.free_resources() + print("AI resources released.") + else: + print("No resources to release.") + +# ---------------------------------------------------------------------- +# Main execution block +# ---------------------------------------------------------------------- +if __name__ == "__main__": + if len(sys.argv) != 2: + print("Usage: python batch_ocr.py ") + sys.exit(1) + + image_dir = sys.argv[1] + + try: + init_engine() + png_paths = collect_png_paths(image_dir) + texts = ocr_batch(png_paths) + + # Optional: write results to a single text file + output_file = Path("ocr_results.txt") + with output_file.open("w", encoding="utf-8") as f: + for path, txt in zip(png_paths, texts): + f.write(f"--- {path.name} ---\n") + f.write(txt + "\n\n") + print(f"All results saved to {output_file.resolve()}") + 
    finally:
+        release_resources()
+```
+
+### 预期输出
+
+在包含三个 PNG 的文件夹中运行脚本可能会打印:
+
+```
+Engine already initialized.
+Found 3 PNG image(s) to process.
+Processed invoice1.png: 452 characters.
+Processed receipt2.png: 317 characters.
+Processed flyer3.png: 689 characters.
+All results saved to /home/user/ocr_results.txt
+AI resources released.
+```
+
+`ocr_results.txt` 文件将为每张图像插入清晰的分隔符,并跟随清理后的 OCR 文本。
+
+## 可选存根(Stubs)用于 aocr 与 ai(如果没有真实的包)
+
+如果你只想在不引入重量级 OCR 库的情况下测试流程,可以创建最小的模拟模块:
+
+```python
+# aocr/__init__.py
+class Image:
+    @staticmethod
+    def load(path):
+        return f"ImageObject({path})"
+
+def dummy_recognize(self, image):
+    return "Raw OCR output for " + str(image)
+
+engine = type("Engine", (), {"recognize": dummy_recognize})()
+```
+
+```python
+# ai/__init__.py
+_state = {"initialized": False}
+
+def is_initialized():
+    return _state["initialized"]
+
+def initialize(cfg):
+    print(f"Initializing AI engine with {cfg}")
+    _state["initialized"] = True
+
+def run_postprocessor(text):
+    # Very naive cleanup: strip extra spaces
+    return " ".join(text.split())
+
+def free_resources():
+    print("Freeing AI resources")
+    _state["initialized"] = False
+```
+
+将这些文件夹放在 `batch_ocr.py` 同级目录下,脚本即可运行并打印模拟结果。
+
+## 专业技巧与常见坑点
+
+- **内存峰值**:如果要处理成千上万的高分辨率 PNG,考虑在 OCR 前先对图像进行缩放。`aocr.Image.load` 通常接受 `max_size` 参数。
+- **Unicode 处理**:始终使用 `encoding="utf-8"` 打开输出文件;OCR 引擎可能会输出非 ASCII 字符。
+- **并行化**:对于 CPU 受限的 OCR,你可以将 `ocr_batch` 包装在 `concurrent.futures.ThreadPoolExecutor` 中。只需记住保持单一的 `ai` 实例——让多个线程各自调用 `ai.initialize` 会违背 “free AI resources” 的目标。
+- **错误容错**:在每张图像的循环中加入 `try/except`,这样单个损坏的 PNG 不会导致整个批处理中止。
+
+## 结论
+
+现在你拥有了一个 **python ocr tutorial**,演示了如何 **load png image** 文件、执行 **batch OCR processing**,并负责任地管理 **free AI resources**。完整、可运行的示例清晰展示了如何 **recognize text from image** 对象并在后续进行清理,方便你直接复制粘贴到自己的项目中,而无需寻找缺失的代码片段。
+
+准备好下一步了吗?尝试将存根的 `aocr` 与 `ai` 模块替换为真实库,如 `pytesseract` 和 `torchvision`。你还可以扩展脚本以输出 JSON、将结果推送到数据库,或集成到云存储桶中。可能性无限——祝编码愉快!
+ +{{< /blocks/products/pf/tutorial-page-section >}} +{{< /blocks/products/pf/main-container >}} +{{< /blocks/products/pf/main-wrap-class >}} +{{< blocks/products/products-backtop-button >}} \ No newline at end of file diff --git a/ocr/chinese/python/general/run-ocr-on-image-complete-guide-to-structured-text-extractio/_index.md b/ocr/chinese/python/general/run-ocr-on-image-complete-guide-to-structured-text-extractio/_index.md new file mode 100644 index 000000000..ac7b7a932 --- /dev/null +++ b/ocr/chinese/python/general/run-ocr-on-image-complete-guide-to-structured-text-extractio/_index.md @@ -0,0 +1,252 @@ +--- +category: general +date: 2026-05-03 +description: 学习如何在图像上运行 OCR 并使用结构化 OCR 识别提取带坐标的文本。附带逐步的 Python 代码。 +draft: false +keywords: +- run OCR on image +- extract text with coordinates +- structured OCR recognition +- OCR post‑processing +- bounding box extraction +- image text detection +language: zh +og_description: 对图像进行 OCR 识别,并使用结构化 OCR 获取带坐标的文本。完整的 Python 示例并附有解释。 +og_title: 在图像上运行 OCR – 结构化文本提取教程 +tags: +- OCR +- Python +- Computer Vision +title: 对图像进行 OCR – 结构化文本提取完整指南 +url: /zh/python/general/run-ocr-on-image-complete-guide-to-structured-text-extractio/ +--- + +{{< blocks/products/pf/main-wrap-class >}} +{{< blocks/products/pf/main-container >}} +{{< blocks/products/pf/tutorial-page-section >}} + +# Run OCR on image – 结构化文本提取完整指南 + +是否曾经需要**run OCR on image**文件,却不确定如何保留每个单词的精确位置?你并不孤单。在许多项目中——收据扫描、表单数字化或 UI 测试——你不仅需要原始文本,还需要能够告诉你每行在图片上位置的边界框。 + +本教程将向你展示一种使用 **aocr** 引擎、请求 **structured OCR recognition** 并在保持几何信息的前提下后处理结果的实用方法。完成后,你只需几行 Python 代码即可**extract text with coordinates**,并且会明白结构化模式为何对下游任务如此重要。 + +## 您将学习 + +- 如何为**structured OCR recognition**初始化 OCR 引擎。 +- 如何输入图像并获取包含行边界的原始结果。 +- 如何运行后处理器,在不丢失几何信息的情况下清理文本。 +- 如何遍历最终的行并打印每段文本及其对应的边界框。 + +无需魔法,也没有隐藏步骤——只需一个完整、可运行的示例,直接放入你的项目即可。 + +--- + +## 前置条件 + +在开始之前,请确保已安装以下内容: + +```bash +pip install aocr ai # hypothetical packages; replace with real ones if needed +``` + 
+你还需要一张包含清晰可读文字的图像文件(`input_image.png` 或 `.jpg`)。无论是扫描的发票还是截图,只要 OCR 引擎能够识别字符即可。 + +--- + +## 步骤 1:为结构化识别初始化 OCR 引擎 + +首先我们创建 `aocr.Engine()` 的实例,并告知它我们需要 **structured OCR recognition**。结构化模式不仅返回纯文本,还返回每行的几何数据(边界矩形),这在需要将文本映射回图像时至关重要。 + +```python +import aocr +import ai # hypothetical post‑processing module + +# Initialise the OCR engine +ocr_engine = aocr.Engine() + +# Request structured recognition (text + geometry) +ocr_engine.recognize_mode = aocr.RecognitionMode.Structured +``` + +> **为什么这很重要:** +> 在默认模式下,引擎可能只给你一个串联的字符串。结构化模式提供页面 → 行 → 单词的层级结构,每个元素都带有坐标,使得在原始图像上叠加结果或将其输入布局感知模型变得更加容易。 + +--- + +## 步骤 2:对图像运行 OCR 并获取原始结果 + +现在将图像传入引擎。`recognize` 调用返回一个 `OcrResult` 对象,其中包含一系列行,每行都有自己的边界矩形。 + +```python +# Load your image (any format supported by aocr) +input_image_path = "input_image.png" + +# Run OCR – this returns an OcrResult with lines and bounds +raw_result = ocr_engine.recognize(input_image_path) +``` + +此时 `raw_result.lines` 包含的对象具有两个重要属性: + +- `text` – 该行识别出的字符串。 +- `bounds` – 一个类似 `(x, y, width, height)` 的元组,描述该行的位置。 + +--- + +## 步骤 3:在保持几何信息的前提下后处理 + +原始 OCR 输出往往噪声较多:杂散字符、错误的空格或换行问题。`ai.run_postprocessor` 函数会清理文本,但**保持原始几何**不变,这样你仍然拥有准确的坐标。 + +```python +# Apply a post‑processing step that corrects common OCR errors +postprocessed_result = ai.run_postprocessor(raw_result) + +# The structure (lines + bounds) stays the same, only `line.text` changes +``` + +> **专业提示:** 如果你有特定领域的词汇表(例如产品代码),可以向后处理器提供自定义字典以提升准确率。 + +--- + +## 步骤 4:提取带坐标的文本 – 遍历并展示 + +最后,我们遍历清理后的行,打印每行的边界框以及对应的文本。这就是**extract text with coordinates**的核心。 + +```python +# Print each recognised line together with its bounding box +for line in postprocessed_result.lines: + print(f"[{line.bounds}] {line.text}") +``` + +### 预期输出 + +假设输入图像包含两行文字:“Invoice #12345” 和 “Total: $89.99”,你会看到类似如下的输出: + +``` +[(15, 30, 210, 25)] Invoice #12345 +[(15, 70, 190, 25)] Total: $89.99 +``` + +第一个元组是原始图像上该行的 `(x, y, width, height)`,你可以据此绘制矩形、突出显示文本,或将坐标传入其他系统。 + +--- + +## 可视化结果(可选) + +如果想在图像上叠加显示边界框,可以使用 
Pillow(PIL)绘制矩形。下面是一个快速示例;如果只需要原始数据,可直接跳过。 + +```python +from PIL import Image, ImageDraw + +# Open the original image +img = Image.open(input_image_path) +draw = ImageDraw.Draw(img) + +# Draw a rectangle around each line +for line in postprocessed_result.lines: + x, y, w, h = line.bounds + draw.rectangle([x, y, x + w, y + h], outline="red", width=2) + +# Save or show the annotated image +img.save("annotated_output.png") +img.show() +``` + +![run OCR on image 示例,显示边界框](/images/ocr-bounding-boxes.png "run OCR on image – 边界框覆盖") + +上面的 alt 文本包含了**primary keyword**,满足图片 alt 属性的 SEO 要求。 + +--- + +## 为什么 Structured OCR Recognition 优于简单文本提取 + +你可能会想,“我只想跑 OCR 获得文本,为什么要在意几何信息?” + +- **空间上下文:** 当你需要在表单上映射字段(例如“Date”旁边的日期值)时,坐标告诉你*数据所在的位置*。 +- **多列布局:** 简单的线性文本会失去顺序;结构化数据保留列顺序。 +- **后处理准确性:** 知道框的大小可以帮助你判断一个词是标题、脚注还是杂散噪声。 + +简而言之,**structured OCR recognition** 为你提供了构建更智能流水线的灵活性——无论是将数据写入数据库、创建可搜索的 PDF,还是训练尊重布局的机器学习模型。 + +--- + +## 常见边缘情况及处理方法 + +| 情况 | 需要注意的点 | 建议的解决方案 | +|-----------|-------------------|---------------| +| **旋转或倾斜的图像** | 边界框可能偏离轴线。 | 使用去倾斜预处理(例如 OpenCV 的 `warpAffine`)。 | +| **字体非常小** | 引擎可能漏掉字符,导致空行。 | 提高图像分辨率或使用 `ocr_engine.set_dpi(300)`。 | +| **混合语言** | 错误的语言模型会导致乱码。 | 在识别前设置 `ocr_engine.language = ["en", "de"]`。 | +| **重叠的框** | 后处理器可能会无意合并两行。 | 在处理后验证 `line.bounds`;在 `ai.run_postprocessor` 中调整阈值。 | + +提前处理这些情况,可在日后扩展到每天数百份文档时避免头疼。 + +--- + +## 完整端到端脚本 + +下面是完整的、可直接运行的程序,整合了所有步骤。复制粘贴,修改图像路径,即可使用。 + +```python +# -*- coding: utf-8 -*- +""" +Run OCR on image – extract text with coordinates using structured OCR recognition. 
+Author: Your Name +Date: 2026-05-03 +""" + +import aocr +import ai +from PIL import Image, ImageDraw + +def run_structured_ocr(image_path: str, annotate: bool = False): + # 1️⃣ Initialise the OCR engine + ocr_engine = aocr.Engine() + ocr_engine.recognize_mode = aocr.RecognitionMode.Structured + + # 2️⃣ Recognise the image + raw_result = ocr_engine.recognize(image_path) + + # 3️⃣ Post‑process while keeping geometry + processed = ai.run_postprocessor(raw_result) + + # 4️⃣ Print each line with its bounding box + for line in processed.lines: + print(f"[{line.bounds}] {line.text}") + + # Optional visualisation + if annotate: + img = Image.open(image_path) + draw = ImageDraw.Draw(img) + for line in processed.lines: + x, y, w, h = line.bounds + draw.rectangle([x, y, x + w, y + h], outline="red", width=2) + annotated_path = "annotated_" + image_path + img.save(annotated_path) + print(f"Annotated image saved as {annotated_path}") + +if __name__ == "__main__": + INPUT_IMG = "input_image.png" + run_structured_ocr(INPUT_IMG, annotate=True) +``` + +运行此脚本将: + +1. 使用结构化模式**Run OCR on image**。 +2. 为每行**Extract text with coordinates**。 +3. 可选地生成显示框的标注 PNG。 + +--- + +## 结论 + +现在,你已经拥有一个完整、独立的解决方案,能够**run OCR on image**并**extract text with coordinates**,并使用**structured OCR recognition**。代码演示了从引擎初始化、后处理到可视化验证的每一步,你可以将其应用于收据、表单或任何需要精确文本定位的视觉文档。 + +接下来可以尝试将 `aocr` 引擎替换为其他库(如 Tesseract、EasyOCR),比较它们的结构化输出差异。尝试不同的后处理策略,如拼写检查或自定义正则过滤,以提升特定领域的准确率。如果你在构建更大的流水线,考虑将 `(text, bounds)` 对存入数据库,以便后续分析。 + +祝编码愉快,愿你的 OCR 项目始终精准! 
+ +{{< /blocks/products/pf/tutorial-page-section >}} +{{< /blocks/products/pf/main-container >}} +{{< /blocks/products/pf/main-wrap-class >}} +{{< blocks/products/products-backtop-button >}} \ No newline at end of file diff --git a/ocr/czech/python/general/extract-text-from-image-ocr-with-aspose-ai-spell-check/_index.md b/ocr/czech/python/general/extract-text-from-image-ocr-with-aspose-ai-spell-check/_index.md new file mode 100644 index 000000000..a2c3beafb --- /dev/null +++ b/ocr/czech/python/general/extract-text-from-image-ocr-with-aspose-ai-spell-check/_index.md @@ -0,0 +1,230 @@ +--- +category: general +date: 2026-05-03 +description: extrahovat text z obrázku pomocí Aspose OCR a AI pravopisné kontroly. + Naučte se, jak provést OCR obrázku, načíst obrázek pro OCR, rozpoznat text z faktury + a uvolnit GPU zdroje. +draft: false +keywords: +- extract text from image +- how to ocr image +- load image for ocr +- release gpu resources +- recognize text from invoice +language: cs +og_description: extrahujte text z obrázku pomocí Aspose OCR a AI kontroly pravopisu. + Podrobný návod krok za krokem, jak provést OCR obrázku, načíst obrázek pro OCR a + uvolnit zdroje GPU. +og_title: Extrahovat text z obrázku – Kompletní průvodce OCR a kontrolou pravopisu +tags: +- OCR +- Aspose +- AI +- Python +title: Extrahovat text z obrázku – OCR s Aspose AI kontrolou pravopisu +url: /cs/python/general/extract-text-from-image-ocr-with-aspose-ai-spell-check/ +--- + +{{< blocks/products/pf/main-wrap-class >}} +{{< blocks/products/pf/main-container >}} +{{< blocks/products/pf/tutorial-page-section >}} + +# extrahovat text z obrázku – Kompletní průvodce OCR a kontrolou pravopisu + +Už jste někdy potřebovali **extrahovat text z obrázku**, ale nebyli jste si jisti, která knihovna vám poskytne jak rychlost, tak přesnost? Nejste v tom sami. 
V mnoha reálných projektech – například při zpracování faktur, digitalizaci účtenek nebo skenování smluv – získání čistého, prohledávatelného textu z fotografie je první překážka. + +Dobrou zprávou je, že Aspose OCR v kombinaci s lehkým modelem Aspose AI dokáže tuto úlohu zvládnout během několika řádků Pythonu. V tomto tutoriálu si projdeme **jak OCR obrázek**, jak správně načíst obrázek, spustit vestavěný post‑processor pro kontrolu pravopisu a nakonec **uvolnit GPU prostředky**, aby vaše aplikace zůstala šetrná k paměti. + +Na konci tohoto průvodce budete schopni **rozpoznat text z faktur** na obrázcích, automaticky opravit běžné chyby OCR a udržet GPU čisté pro další dávku. + +--- + +## Co budete potřebovat + +- Python 3.9 nebo novější (kód používá typové nápovědy, ale funguje i na starších verzích 3.x) +- balíčky `aspose-ocr` a `aspose-ai` (instalace pomocí `pip install aspose-ocr aspose-ai`) +- GPU s podporou CUDA je volitelný; skript přejde na CPU, pokud žádný GPU nenajde. +- Ukázkový obrázek, např. `sample_invoice.png`, umístěný ve složce, na kterou můžete odkazovat. + +Žádné těžké ML frameworky, žádné masivní stahování modelů – pouze malý Q4‑K‑M kvantovaný model, který pohodlně vejde na většinu GPU. + +--- + +## Krok 1: Inicializace OCR enginu – extrahovat text z obrázku + +Prvním krokem je vytvořit instanci `OcrEngine` a nastavit jazyk, který očekáváte. Zde zvolíme angličtinu a požádáme o výstup ve formátu prostého textu, což je ideální pro následné zpracování. + +```python +import aocr # Aspose OCR package +import aspose.ai as ai # Aspose AI package + +# Initialise the OCR engine +ocr_engine = aocr.OcrEngine() +ocr_engine.language = aocr.Language.English # Choose any supported language +ocr_engine.recognize_mode = aocr.RecognitionMode.Plain # Plain text makes post‑processing easier +``` + +**Proč je to důležité:** Nastavení jazyka omezuje znakovou sadu, čímž se zvyšuje přesnost. 
Režim prostého textu odstraňuje informace o rozložení, které obvykle nepotřebujete, když jen chcete extrahovat text z obrázku. + +--- + +## Krok 2: Načtení obrázku pro OCR – jak OCR obrázek + +Nyní předáme enginu skutečný obrázek. Pomocná funkce `Image.load` rozumí běžným formátům (PNG, JPEG, TIFF) a abstrahuje nepříjemnosti souborového I/O. + +```python +# Load the input image – this is the "load image for OCR" step +input_image = aocr.Image.load("YOUR_DIRECTORY/sample_invoice.png") +raw_text = ocr_engine.recognize(input_image) # Returns the recognised text as a string +``` + +**Tip:** Pokud jsou vaše vstupní obrázky velké, zvažte jejich zmenšení před odesláním do enginu; menší rozměry mohou snížit využití GPU paměti, aniž by to ovlivnilo kvalitu rozpoznání. + +--- + +## Krok 3: Konfigurace Aspose AI modelu – rozpoznat text z faktury + +Aspose AI obsahuje malý GGUF model, který lze automaticky stáhnout. Příklad používá repozitář `Qwen2.5‑3B‑Instruct‑GGUF`, kvantovaný na `q4_k_m`. Také říkáme runtime, aby alokoval 20 vrstev na GPU, což vyvažuje rychlost a využití VRAM. + +```python +# Model configuration – auto‑download a small Q4‑K‑M quantised model +model_config = ai.AsposeAIModelConfig() +model_config.allow_auto_download = "true" +model_config.hugging_face_repo_id = "bartowski/Qwen2.5-3B-Instruct-GGUF" +model_config.hugging_face_quantization = "q4_k_m" +model_config.gpu_layers = 20 # Use 20 GPU layers when a GPU is available +``` + +**Co se děje pod kapotou:** Kvantovaný model má na disku přibližně 1,5 GB, což je jen zlomek plně přesného modelu, ale stále zachycuje dostatečnou jazykovou nuance pro odhalení typických OCR překlepů. + +--- + +## Krok 4: Inicializace AsposeAI a připojení post‑processoru pro kontrolu pravopisu + +Aspose AI obsahuje připravený post‑processor pro kontrolu pravopisu. Připojením tohoto komponentu bude každý výsledek OCR automaticky vyčištěn. 
+ +```python +# Initialise AsposeAI and attach the built‑in spell‑check post‑processor +ocr_ai = ai.AsposeAI(model_config) # Pass the config we just built +ocr_ai.set_post_processor(ocr_ai.postprocessor_spell_check, {}) # Empty dict → default settings +``` + +**Proč používat post‑processor?** OCR enginy často přečtou „Invoice“ jako „Invo1ce“ nebo „Total“ jako „T0tal“. Kontrola pravopisu spustí lehký jazykový model nad surovým řetězcem a opraví tyto chyby, aniž byste museli psát vlastní slovník. + +--- + +## Krok 5: Spuštění post‑processoru pro kontrolu pravopisu na výsledku OCR + +Po propojení všech částí jediným voláním získáte opravený text. Také vypíšeme jak originální, tak vyčištěnou verzi, abyste viděli rozdíl. + +```python +# Run the spell‑check post‑processor on the OCR result +corrected_text = ocr_ai.run_postprocessor(raw_text) + +print("Original :", raw_text) +print("Corrected:", corrected_text) +``` + +Typický výstup pro fakturu může vypadat takto: + +``` +Original : Invo1ce #12345 +Date: 2023/07/15 +Total: $1,250.00 +... +Corrected: Invoice #12345 +Date: 2023/07/15 +Total: $1,250.00 +... +``` + +Všimněte si, jak se „Invo1ce“ změnilo na správné slovo „Invoice“. To je síla vestavěné AI kontroly pravopisu. + +--- + +## Krok 6: Uvolnění GPU prostředků – bezpečně uvolnit GPU zdroje + +Pokud tento skript spouštíte v dlouho běžící službě (např. webovém API, které zpracovává desítky faktur za minutu), musíte po každé dávce uvolnit GPU kontext. Jinak se objeví úniky paměti a nakonec chyby „CUDA out of memory“. + +```python +# Release GPU resources – crucial to avoid memory leaks +ocr_ai.free_resources() +``` + +**Profesionální tip:** Zavolejte `free_resources()` uvnitř `finally` bloku nebo kontextového manažera, aby se vždy provedlo, i když dojde k výjimce. + +--- + +## Kompletní funkční příklad + +Sestavením všech částí získáte samostatný skript, který můžete vložit do libovolného projektu. 
+ +```python +# extract_text_from_image.py +import aocr +import aspose.ai as ai + +def main(): + # Step 1: Initialise OCR engine + ocr_engine = aocr.OcrEngine() + ocr_engine.language = aocr.Language.English + ocr_engine.recognize_mode = aocr.RecognitionMode.Plain + + # Step 2: Load image for OCR + input_image = aocr.Image.load("YOUR_DIRECTORY/sample_invoice.png") + raw_text = ocr_engine.recognize(input_image) + + # Step 3: Configure Aspose AI model + model_cfg = ai.AsposeAIModelConfig() + model_cfg.allow_auto_download = "true" + model_cfg.hugging_face_repo_id = "bartowski/Qwen2.5-3B-Instruct-GGUF" + model_cfg.hugging_face_quantization = "q4_k_m" + model_cfg.gpu_layers = 20 + + # Step 4: Initialise AI and attach spell‑check + ocr_ai = ai.AsposeAI(model_cfg) + ocr_ai.set_post_processor(ocr_ai.postprocessor_spell_check, {}) + + # Step 5: Run spell‑check + corrected_text = ocr_ai.run_postprocessor(raw_text) + + print("Original :", raw_text) + print("Corrected:", corrected_text) + + # Step 6: Release GPU resources + ocr_ai.free_resources() + +if __name__ == "__main__": + main() +``` + +Uložte soubor, upravte cestu k vašemu obrázku a spusťte `python extract_text_from_image.py`. Měli byste vidět vyčištěný text faktury vytištěný v konzoli. + +--- + +## Často kladené otázky (FAQ) + +**Q: Funguje to i na strojích bez GPU?** +A: Rozhodně. Pokud není detekován žádný GPU, Aspose AI přejde na CPU, i když bude pomalejší. CPU můžete vynutit nastavením `model_cfg.gpu_layers = 0`. + +**Q: Co když jsou mé faktury v jiném jazyce než angličtině?** +A: Změňte `ocr_engine.language` na odpovídající enum hodnotu (např. `aocr.Language.Spanish`). Model pro kontrolu pravopisu je vícejazykový, ale můžete dosáhnout lepších výsledků s jazykově specifickým modelem. + +**Q: Můžu zpracovávat více obrázků ve smyčce?** +A: Ano. Stačí přesunout kroky načítání, rozpoznání a post‑processingu do `for` smyčky. 
Nezapomeňte po smyčce nebo po každé dávce zavolat `ocr_ai.free_resources()`, pokud používáte stejnou AI instanci. + +**Q: Jak velké je stažení modelu?** +A: Přibližně 1,5 GB pro kvantovanou verzi `q4_k_m`. Po prvním spuštění se model uloží do cache, takže následná spuštění jsou okamžitá. + +--- + +## Závěr + +V tomto tutoriálu jsme ukázali, jak **extrahovat text z obrázku** pomocí Aspose OCR, nakonfigurovat malý AI model, aplikovat post‑processor pro kontrolu pravopisu a bezpečně **uvolnit GPU prostředky**. Pracovní postup pokrývá vše od načtení obrázku až po úklid po sobě, což vám poskytuje spolehlivý pipeline pro scénáře **rozpoznat text z faktury**. + +Další kroky? Vyzkoušejte výměnu kontroly pravopisu za vlastní model pro extrakci entit + +{{< /blocks/products/pf/tutorial-page-section >}} +{{< /blocks/products/pf/main-container >}} +{{< /blocks/products/pf/main-wrap-class >}} +{{< blocks/products/products-backtop-button >}} \ No newline at end of file diff --git a/ocr/czech/python/general/how-to-batch-ocr-with-aspose-ocr-full-python-guide/_index.md b/ocr/czech/python/general/how-to-batch-ocr-with-aspose-ocr-full-python-guide/_index.md new file mode 100644 index 000000000..dd60eb70f --- /dev/null +++ b/ocr/czech/python/general/how-to-batch-ocr-with-aspose-ocr-full-python-guide/_index.md @@ -0,0 +1,215 @@ +--- +category: general +date: 2026-05-03 +description: Jak dávkově provádět OCR obrázků pomocí Aspose OCR a AI pravopisné kontroly. + Naučte se extrahovat text z obrázků, použít pravopisnou kontrolu, využít bezplatné + AI zdroje a opravit chyby OCR. +draft: false +keywords: +- how to batch ocr +- extract text from images +- free ai resources +- apply spell check +- correct ocr errors +language: cs +og_description: Jak dávkově provádět OCR obrázků pomocí Aspose OCR a AI pravopisné + kontroly. Postupujte podle krok‑za‑krokem průvodce k extrakci textu z obrázků, aplikaci + pravopisné kontroly, využití bezplatných AI zdrojů a opravě chyb OCR. 
+og_title: Jak provádět dávkové OCR pomocí Aspose OCR – Kompletní tutoriál v Pythonu +tags: +- OCR +- Python +- AI +- Aspose +title: Jak provádět dávkové OCR pomocí Aspose OCR – Kompletní průvodce v Pythonu +url: /cs/python/general/how-to-batch-ocr-with-aspose-ocr-full-python-guide/ +--- + +{{< blocks/products/pf/main-wrap-class >}} +{{< blocks/products/pf/main-container >}} +{{< blocks/products/pf/tutorial-page-section >}} + +# Jak provádět hromadné OCR pomocí Aspose OCR – Kompletní průvodce v Pythonu + +Už jste se někdy zamýšleli **jak provádět hromadné OCR** na celou složku naskenovaných PDF nebo fotografií, aniž byste museli psát samostatný skript pro každý soubor? Nejste v tom sami. V mnoha reálných pipelinech budete potřebovat **extrahovat text z obrázků**, opravit pravopisné chyby a nakonec uvolnit jakékoli AI zdroje, které jste alokovali. Tento tutoriál vám přesně ukáže, jak to provést pomocí Aspose OCR, lehkého AI post‑processoru, a několika řádků Pythonu. + +Provedeme vás inicializací OCR enginu, napojením AI pravopisného kontroléru, procházením adresáře s obrázky a následným vyčištěním modelu. Na konci budete mít připravený skript, který **automaticky opravuje OCR chyby** a uvolňuje **volné AI zdroje**, takže vaše GPU zůstane spokojená. + +## Co budete potřebovat + +- Python 3.9+ (kód používá type‑hints, ale funguje i na starších verzích 3.x) +- `asposeocr` package (`pip install asposeocr`) – poskytuje OCR engine. +- Přístup k modelu Hugging Face `bartowski/Qwen2.5-3B-Instruct-GGUF` (stáhne se automaticky). +- GPU s alespoň několika GB VRAM (skript nastavuje `gpu_layers = 30`, můžete to snížit, pokud je potřeba). + +Žádné externí služby, žádné placené API – vše běží lokálně. + +--- + +## Krok 1: Nastavení OCR enginu – **Jak provádět hromadné OCR** efektivně + +Než budeme moci zpracovat tisíc obrázků, potřebujeme spolehlivý OCR engine. Aspose OCR nám umožňuje vybrat jazyk a režim rozpoznávání jedním voláním. 
+ +```python +# Step 1: Initialize the OCR engine for English plain‑text output +def init_ocr() -> aocr.OcrEngine: + ocr_engine = aocr.OcrEngine() + ocr_engine.language = aocr.Language.English # English language pack + ocr_engine.recognize_mode = aocr.RecognitionMode.Plain # Returns raw string, no layout + return ocr_engine +``` + +**Proč je to důležité:** Nastavení `recognize_mode` na `Plain` udržuje výstup lehký, což je ideální, pokud později plánujete spustit pravopisnou kontrolu. Pokud byste potřebovali informace o rozložení, přepnuli byste na `Layout`, ale to přidává režii, kterou v hromadném úkolu pravděpodobně nebudete chtít. + +> **Tip:** Pokud pracujete s vícejazyčnými skeny, můžete předat seznam jako `ocr_engine.language = [aocr.Language.English, aocr.Language.Spanish]`. + +--- + +## Krok 2: Inicializace AI post‑processoru – **Aplikovat pravopisnou kontrolu** na OCR výstup + +Aspose AI přichází s vestavěným post‑procesorem, který může spouštět libovolný model. Zde načteme kvantizovaný model Qwen 2.5 z Hugging Face a připojíme rutinu pravopisné kontroly. + +```python +# Step 2: Configure and start the AI post‑processor +def init_ai() -> aocr.ai.AsposeAI: + model_cfg = AsposeAIModelConfig() + model_cfg.allow_auto_download = "true" + model_cfg.hugging_face_repo_id = "bartowski/Qwen2.5-3B-Instruct-GGUF" + model_cfg.hugging_face_quantization = "q4_k_m" + model_cfg.gpu_layers = 30 # Adjust based on your GPU memory + ai_processor = AsposeAI() + ai_processor.initialize(model_cfg) + + # Attach the built‑in spell‑check post‑processor + ai_processor.set_post_processor(ai_processor.postprocessor_spell_check, {}) + return ai_processor +``` + +**Proč je to důležité:** Model je kvantizovaný (`q4_k_m`), což výrazně snižuje spotřebu paměti a přitom poskytuje slušné porozumění jazyku. Voláním `set_post_processor` říkáme Aspose AI, aby automaticky spustil krok **apply spell check** na jakýkoli řetězec, který mu předáme. 
+ +> **Pozor:** Pokud vaše GPU nedokáže zvládnout 30 vrstev, snižte číslo na 15 nebo dokonce 5 – skript bude stále fungovat, jen trochu pomaleji. + +--- + +## Krok 3: Spuštění OCR a **oprava OCR chyb** na jednom obrázku + +Nyní, když jsou OCR engine i AI pravopisná kontrola připravené, spojíme je. Tato funkce načte obrázek, extrahuje surový text a poté spustí AI post‑processor k jeho vyčištění. + +```python +# Step 3: OCR an image and run the spell‑check post‑processor +def ocr_and_correct(image_path: str, + ocr_engine: aocr.OcrEngine, + ai_processor: aocr.ai.AsposeAI) -> str: + image = aocr.Image.load(image_path) # Load any supported format + raw_text = ocr_engine.recognize(image) # Plain string from OCR + corrected_text = ai_processor.run_postprocessor(raw_text) + return corrected_text +``` + +**Proč je to důležité:** Přímé předání surového OCR řetězce do AI modelu nám poskytuje průchod **correct OCR errors** bez nutnosti psát regexy nebo vlastní slovníky. Model rozumí kontextu, takže může opravit „recieve“ → „receive“ a i subtilnější chyby. + +--- + +## Krok 4: **Extrahovat text z obrázků** hromadně – Skutečná smyčka pro batch + +Zde se ukáže kouzlo **jak provádět hromadné OCR**. Procházíme adresář, přeskočíme nepodporované soubory a zapíšeme každý opravený výstup do souboru `.txt`. 
+ +```python +# Step 4: Process an entire folder of images +if __name__ == "__main__": + # Initialize once – reuse for every file + ocr_engine = init_ocr() + ai_processor = init_ai() + + input_dir = "YOUR_DIRECTORY/input_images" + output_dir = "YOUR_DIRECTORY/output_text" + os.makedirs(output_dir, exist_ok=True) + + for file_name in os.listdir(input_dir): + # Only handle common image extensions + if not file_name.lower().endswith(('.png', '.jpg', '.jpeg', '.tif', '.tiff')): + continue + + image_path = os.path.join(input_dir, file_name) + corrected = ocr_and_correct(image_path, ocr_engine, ai_processor) + + txt_path = os.path.join(output_dir, + os.path.splitext(file_name)[0] + ".txt") + with open(txt_path, "w", encoding="utf-8") as txt_file: + txt_file.write(corrected) + + print(f"Processed {file_name}") + + # Step 5: Release **free AI resources** after the batch finishes + ai_processor.free_resources() +``` + +### Očekávaný výstup + +Pro obrázek obsahující větu *„The quick brown fox jumps over the lazzy dog.“* uvidíte textový soubor s: + +``` +The quick brown fox jumps over the lazy dog. +``` + +Všimněte si, že dvojité „z“ bylo automaticky opraveno – to je AI pravopisná kontrola v akci. + +**Proč je to důležité:** Vytvořením OCR a AI objektů **jednou** a jejich opakovaným použitím se vyhneme režii načítání modelu pro každý soubor. To je nejefektivnější způsob, jak **provádět hromadné OCR** ve velkém měřítku. + +--- + +## Krok 5: Vyčištění – **Uvolnit AI zdroje** správně + +Po dokončení volání `free_resources()` uvolní GPU paměť, CUDA kontexty a všechny dočasné soubory, které model vytvořil. + +```python +# Step 5: Explicitly free GPU and model memory +ai_processor.free_resources() +``` + +Přeskočení tohoto kroku může zanechat visící alokace GPU, což může způsobit pád následných Python procesů nebo spotřebovat VRAM. Považujte to za část „vypnout světla“ v batch úkolu. 
+ +--- + +## Časté problémy a další tipy + +| Problém | Co sledovat | Řešení | +|-------|------------------|-----| +| **Chyby nedostatku paměti** | GPU se vyčerpá po několika desítkách obrázků | Snižte `gpu_layers` nebo přepněte na CPU (`model_cfg.gpu_layers = 0`). | +| **Chybějící jazykový balíček** | OCR vrací prázdné řetězce | Ujistěte se, že verze `asposeocr` obsahuje data pro anglický jazyk; v případě potřeby přeinstalujte. | +| **Soubory, které nejsou obrázky** | Skript spadne při náhodném `.pdf` | Ochrana `if not file_name.lower().endswith(...)` je již nastavená, aby je přeskočila. | +| **Pravopisná kontrola nebyla aplikována** | Výstup vypadá identicky jako surový OCR | Ověřte, že `ai_processor.set_post_processor` byl zavolán před smyčkou. | +| **Pomalá rychlost batch** | Trvá >5 sekund na obrázek | Povolte `model_cfg.allow_auto_download = "false"` po prvním spuštění, aby se model nestahoval pokaždé znovu. | + +**Tip:** Pokud potřebujete **extrahovat text z obrázků** v jiném jazyce než angličtině, jednoduše změňte `ocr_engine.language` na odpovídající enum (např. `aocr.Language.French`). Stejný AI post‑processor bude i nadále aplikovat pravopisnou kontrolu, ale pro nejlepší výsledky můžete chtít jazykově specifický model. + +--- + +## Shrnutí a další kroky + +Probrali jsme celý pipeline pro **jak provádět hromadné OCR**: + +1. **Inicializujte** OCR engine pro plain‑text v angličtině. +2. **Nakonfigurujte** AI model pro pravopisnou kontrolu a připojte jej jako post‑processor. +3. **Spusťte** OCR na každém obrázku a nechte AI **automaticky opravit OCR chyby**. +4. **Projděte** adresář a **extrahujte text z obrázků** hromadně. +5. **Uvolněte AI zdroje** po dokončení úkolu. + +Odtud můžete: + +- Přesměrovat opravený text do následného NLP pipeline (analýza sentimentu, extrakce entit, atd.). +- Vyměnit post‑processor pravopisné kontroly za vlastní sumarizátor voláním `ai_processor.set_post_processor(your_custom_func, {})`. 
+- Paralelizovat smyčku přes složku pomocí `concurrent.futures.ThreadPoolExecutor`, pokud vaše GPU zvládne více streamů. + +--- + +## Závěrečné úvahy + +Hromadné OCR nemusí být obtížné. Využitím Aspose OCR spolu s lehkým AI modelem získáte **komplexní řešení**, které **extrahuje text z obrázků**, **aplikuje pravopisnou kontrolu**, **opravuje OCR chyby** a **čistě uvolňuje AI zdroje**. Vyzkoušejte skript na testovací složce, upravte počet GPU vrstev podle vašeho hardwaru a během několika minut budete mít připravený pipeline pro produkci. + +Máte otázky ohledně ladění modelu, práce s PDF nebo integrace do webové služby? Zanechte komentář níže nebo mě kontaktujte na GitHubu. Šťastné kódování a ať je vaše OCR vždy přesné! + +{{< /blocks/products/pf/tutorial-page-section >}} +{{< /blocks/products/pf/main-container >}} +{{< /blocks/products/pf/main-wrap-class >}} +{{< blocks/products/products-backtop-button >}} \ No newline at end of file diff --git a/ocr/czech/python/general/python-ocr-tutorial-batch-ocr-processing-made-easy/_index.md b/ocr/czech/python/general/python-ocr-tutorial-batch-ocr-processing-made-easy/_index.md new file mode 100644 index 000000000..b4c4747a9 --- /dev/null +++ b/ocr/czech/python/general/python-ocr-tutorial-batch-ocr-processing-made-easy/_index.md @@ -0,0 +1,298 @@ +--- +category: general +date: 2026-05-03 +description: Python OCR tutoriál, který ukazuje, jak načíst PNG soubory obrázků, rozpoznat + text z obrázku a využít bezplatné AI zdroje pro dávkové zpracování OCR. +draft: false +keywords: +- python ocr tutorial +- batch ocr processing +- free ai resources +- load png image +- recognize text from image +language: cs +og_description: Python OCR tutoriál vás provede načítáním PNG obrázků, rozpoznáváním + textu z obrázku a využíváním bezplatných AI zdrojů pro hromadné zpracování OCR. 
+og_title: Python OCR návod – Rychlý hromadný OCR s volnými AI zdroji +tags: +- OCR +- Python +- AI +title: Python OCR tutoriál – Hromadné zpracování OCR snadno +url: /cs/python/general/python-ocr-tutorial-batch-ocr-processing-made-easy/ +--- + +{{< blocks/products/pf/main-wrap-class >}} +{{< blocks/products/pf/main-container >}} +{{< blocks/products/pf/tutorial-page-section >}} + +# Python OCR tutoriál – Snadné zpracování OCR po dávkách + +Už jste někdy potřebovali **python ocr tutorial**, který vám skutečně umožní spustit OCR na desítkách PNG souborů, aniž byste si trhali vlasy? Nejste sami. V mnoha reálných projektech musíte **load png image** soubory, předat je enginu a pak po dokončení uvolnit AI zdroje. + +V tomto průvodci projdeme kompletním, připraveným příkladem, který ukazuje přesně, jak **recognize text from image** soubory, zpracovat je po dávkách a uvolnit podkladovou AI paměť. Na konci budete mít samostatný skript, který můžete vložit do libovolného projektu – žádné zbytečnosti, jen podstata. + +## Co budete potřebovat + +- Python 3.10 nebo novější (syntaxe zde používá f‑stringy a typové nápovědy) +- OCR knihovnu, která poskytuje metodu `engine.recognize` – pro demonstraci předpokládáme fiktivní balíček `aocr`, ale můžete nahradit Tesseract, EasyOCR atd. +- Modul `ai` z ukázky kódu (spravuje inicializaci modelu a úklid zdrojů) +- Složku plnou PNG souborů, které chcete zpracovat + +Pokud nemáte nainstalované `aocr` nebo `ai`, můžete je napodobit pomocí stubů – viz sekce „Optional Stubs“ na konci. + +## Krok 1: Inicializace AI enginu (Free AI Resources) + +Než pošlete jakýkoli obrázek do OCR pipeline, podkladový model musí být připraven. Jednorázová inicializace šetří paměť a zrychluje dávkové úlohy. + +```python +# step_1_initialize.py +import ai # hypothetical helper that wraps the AI model +import aocr # OCR library + +def init_engine(config_path: str = "config.yaml"): + """ + Initialize the AI engine if it hasn't been set up yet. 
+ This uses free AI resources – the engine will be released later. + """ + if not ai.is_initialized(): + ai.initialize(config_path) # auto‑initialize with the provided configuration + else: + print("Engine already initialized.") +``` + +**Proč je to důležité:** +Volání `ai.initialize` opakovaně pro každý obrázek by alokovalo GPU paměť znovu a znovu, což nakonec skript zhavaruje. Kontrolou `ai.is_initialized()` garantujeme jedinou alokaci – to je princip „free AI resources“. + +## Krok 2: Načtení PNG souborů pro dávkové OCR zpracování + +Nyní shromáždíme všechny PNG soubory, které chceme spustit přes OCR. Použití `pathlib` udržuje kód nezávislý na OS. + +```python +# step_2_load_images.py +from pathlib import Path +from typing import List + +def collect_png_paths(directory: str) -> List[Path]: + """ + Scan `directory` and return a list of Path objects pointing to PNG files. + """ + base_path = Path(directory) + if not base_path.is_dir(): + raise NotADirectoryError(f"'{directory}' is not a valid folder.") + + png_files = sorted(base_path.glob("*.png")) + if not png_files: + raise FileNotFoundError("No PNG images found in the specified directory.") + + print(f"Found {len(png_files)} PNG image(s) to process.") + return png_files +``` + +**Hraniční případ:** +Pokud složka obsahuje soubory, které nejsou PNG (např. JPEG), budou ignorovány, čímž zabráníme tomu, aby `engine.recognize` selhal na nepodporovaném formátu. + +## Krok 3: Spuštění OCR na každém obrázku a aplikace post‑processingu + +S připraveným enginem a seznamem souborů můžeme iterovat přes obrázky, získat surový text a předat ho post‑processoru, který vyčistí běžné OCR artefakty (jako jsou zbytečné zalomení řádků). + +```python +# step_3_ocr_batch.py +import aocr +import ai +from pathlib import Path +from typing import List + +def ocr_batch(image_paths: List[Path]) -> List[str]: + """ + Perform OCR on each PNG image and return a list of cleaned strings. 
+ """ + results = [] + for image_path in image_paths: + # Load the image – aocr.Image.load abstracts away Pillow/OpenCV details + img = aocr.Image.load(str(image_path)) + + # Recognize raw text + raw_text = engine.recognize(img) + + # Refine the raw OCR output using the AI post‑processor + cleaned_text = ai.run_postprocessor(raw_text) + results.append(cleaned_text) + + print(f"Processed {image_path.name}: {len(cleaned_text)} characters extracted.") + + return results +``` + +**Proč oddělujeme načítání od rozpoznání:** +`aocr.Image.load` může provádět líné dekódování, což je rychlejší pro velké dávky. Explicitní krok načítání také usnadňuje výměnu za jinou knihovnu, pokud později potřebujete zpracovávat JPEG nebo TIFF soubory. + +## Krok 4: Úklid – uvolnění AI zdrojů po dokončení dávky + +Po dokončení dávky musíme model uvolnit, aby nedocházelo k únikům paměti, zejména na strojích s GPU. + +```python +# step_4_cleanup.py +import ai + +def release_resources(): + """ + Free any allocated AI resources. Safe to call multiple times. + """ + if ai.is_initialized(): + ai.free_resources() + print("AI resources have been released.") + else: + print("No AI resources were allocated.") +``` + +## Spojení všeho dohromady – kompletní skript + +Níže je jeden soubor, který propojí čtyři kroky do koherentního workflow. Uložte jej jako `batch_ocr.py` a spusťte z příkazové řádky. + +```python +# batch_ocr.py +""" +Python OCR tutorial – end‑to‑end batch OCR processing. +Loads PNG images, runs OCR, post‑processes results, and frees AI resources. 
+""" + +import sys +from pathlib import Path +import ai +import aocr + +# ---------------------------------------------------------------------- +# Helper functions (copied from the steps above) +# ---------------------------------------------------------------------- +def init_engine(cfg: str = "config.yaml"): + if not ai.is_initialized(): + ai.initialize(cfg) + else: + print("Engine already initialized.") + +def collect_png_paths(directory: str): + base_path = Path(directory) + if not base_path.is_dir(): + raise NotADirectoryError(f"'{directory}' is not a valid folder.") + png_files = sorted(base_path.glob("*.png")) + if not png_files: + raise FileNotFoundError("No PNG images found in the specified directory.") + print(f"Found {len(png_files)} PNG image(s) to process.") + return png_files + +def ocr_batch(image_paths): + results = [] + for image_path in image_paths: + img = aocr.Image.load(str(image_path)) + raw_text = engine.recognize(img) + cleaned_text = ai.run_postprocessor(raw_text) + results.append(cleaned_text) + print(f"Processed {image_path.name}: {len(cleaned_text)} characters.") + return results + +def release_resources(): + if ai.is_initialized(): + ai.free_resources() + print("AI resources released.") + else: + print("No resources to release.") + +# ---------------------------------------------------------------------- +# Main execution block +# ---------------------------------------------------------------------- +if __name__ == "__main__": + if len(sys.argv) != 2: + print("Usage: python batch_ocr.py ") + sys.exit(1) + + image_dir = sys.argv[1] + + try: + init_engine() + png_paths = collect_png_paths(image_dir) + texts = ocr_batch(png_paths) + + # Optional: write results to a single text file + output_file = Path("ocr_results.txt") + with output_file.open("w", encoding="utf-8") as f: + for path, txt in zip(png_paths, texts): + f.write(f"--- {path.name} ---\n") + f.write(txt + "\n\n") + print(f"All results saved to {output_file.resolve()}") + 
 finally:
+ release_resources()
+```
+
+### Očekávaný výstup
+
+Spuštění skriptu nad složkou obsahující tři PNG může vypsat:
+
+```
+Engine already initialized.
+Found 3 PNG image(s) to process.
+Processed invoice1.png: 452 characters.
+Processed receipt2.png: 317 characters.
+Processed flyer3.png: 689 characters.
+All results saved to /home/user/ocr_results.txt
+AI resources released.
+```
+
+Soubor `ocr_results.txt` bude obsahovat jasný oddělovač pro každý obrázek následovaný vyčištěným OCR textem.
+
+## Volitelné stuby pro aocr & ai (pokud nemáte skutečné balíčky)
+
+Pokud chcete jen otestovat tok bez těžkých OCR knihoven, můžete vytvořit minimální mock moduly:
+
+```python
+# aocr/__init__.py
+class Image:
+ @staticmethod
+ def load(path):
+ return f"ImageObject({path})"
+
+def dummy_recognize(image):
+ return "Raw OCR output for " + str(image)
+
+# staticmethod prevents the instance from being bound as the first argument
+engine = type("Engine", (), {"recognize": staticmethod(dummy_recognize)})()
+```
+
+```python
+# ai/__init__.py
+_state = {"initialized": False}
+
+def is_initialized():
+ return _state["initialized"]
+
+def initialize(cfg):
+ print(f"Initializing AI engine with {cfg}")
+ _state["initialized"] = True
+
+def run_postprocessor(text):
+ # Very naive cleanup: strip extra spaces
+ return " ".join(text.split())
+
+def free_resources():
+ print("Freeing AI resources")
+ _state["initialized"] = False
+```
+
+Umístěte tyto složky vedle `batch_ocr.py` a skript poběží, vypisujíc mock výsledky.
+
+## Pro tipy & časté úskalí
+
+- **Špičky paměti:** Pokud zpracováváte tisíce vysokého rozlišení PNG, zvažte jejich zmenšení před OCR. `aocr.Image.load` často přijímá argument `max_size`.
+- **Zpracování Unicode:** Vždy otevírejte výstupní soubor s `encoding="utf-8"`; OCR enginy mohou generovat ne‑ASCII znaky.
+- **Paralelizace:** Pro CPU‑bound OCR můžete obalit `ocr_batch` do `concurrent.futures.ThreadPoolExecutor`.
Jen nezapomeňte udržet jedinou instanci `ai` – spouštění mnoha vláken, které každé volají `ai.initialize`, podkopává cíl „free AI resources“. +- **Odolnost vůči chybám:** Zabalte smyčku přes obrázky do `try/except`, aby jeden poškozený PNG neukončil celou dávku. + +## Závěr + +Nyní máte **python ocr tutorial**, který demonstruje, jak **load png image** soubory, provést **batch OCR processing** a zodpovědně spravovat **free AI resources**. Kompletní, spustitelný příklad ukazuje přesně, jak **recognize text from image** objekty a následně úklid, takže jej můžete zkopírovat do vlastních projektů bez hledání chybějících částí. + +Jste připraveni na další krok? Vyzkoušejte výměnu stubovaných modulů `aocr` a `ai` za skutečné knihovny jako `pytesseract` a `torchvision`. Můžete také rozšířit skript o výstup do JSON, odesílání výsledků do databáze nebo integraci s cloudovým úložištěm. Možnosti jsou neomezené – šťastné kódování! + +{{< /blocks/products/pf/tutorial-page-section >}} +{{< /blocks/products/pf/main-container >}} +{{< /blocks/products/pf/main-wrap-class >}} +{{< blocks/products/products-backtop-button >}} \ No newline at end of file diff --git a/ocr/czech/python/general/run-ocr-on-image-complete-guide-to-structured-text-extractio/_index.md b/ocr/czech/python/general/run-ocr-on-image-complete-guide-to-structured-text-extractio/_index.md new file mode 100644 index 000000000..0b12c4187 --- /dev/null +++ b/ocr/czech/python/general/run-ocr-on-image-complete-guide-to-structured-text-extractio/_index.md @@ -0,0 +1,236 @@ +--- +category: general +date: 2026-05-03 +description: Naučte se, jak spustit OCR na obrázku a extrahovat text s koordináty + pomocí strukturovaného rozpoznávání OCR. Krok za krokem je zahrnutý Python kód. 
+draft: false +keywords: +- run OCR on image +- extract text with coordinates +- structured OCR recognition +- OCR post‑processing +- bounding box extraction +- image text detection +language: cs +og_description: Spusťte OCR na obrázku a získejte text s koordináty pomocí strukturovaného + rozpoznávání OCR. Kompletní příklad v Pythonu s vysvětlením. +og_title: Spusťte OCR na obrázku – Návod na extrakci strukturovaného textu +tags: +- OCR +- Python +- Computer Vision +title: Spusťte OCR na obrázku – Kompletní průvodce extrakcí strukturovaného textu +url: /cs/python/general/run-ocr-on-image-complete-guide-to-structured-text-extractio/ +--- + +{{< blocks/products/pf/main-wrap-class >}} +{{< blocks/products/pf/main-container >}} +{{< blocks/products/pf/tutorial-page-section >}} + +# Spusťte OCR na obrázku – Kompletní průvodce extrakcí strukturovaného textu + +Už jste někdy potřebovali **run OCR on image** soubory, ale nebyli jste si jisti, jak zachovat přesné pozice každého slova? Nejste v tom sami. V mnoha projektech—skenování účtenek, digitalizace formulářů nebo testování UI—potřebujete nejen surový text, ale také ohraničující rámečky, které vám ukazují, kde se každá řádka nachází na obrázku. + +Tento tutoriál vám ukáže praktický způsob, jak *run OCR on image* pomocí **aocr** enginu, požádat o **structured OCR recognition**, a poté provést post‑processing výsledku při zachování geometrie. Na konci budete schopni **extract text with coordinates** během několika řádků Pythonu a pochopíte, proč je strukturovaný režim důležitý pro následné úkoly. + +## Co se naučíte + +- Jak inicializovat OCR engine pro **structured OCR recognition**. +- Jak načíst obrázek a získat surové výsledky, které obsahují ohraničení řádků. +- Jak spustit post‑processor, který vyčistí text, aniž by ztratil geometrické informace. +- Jak iterovat přes finální řádky a vytisknout každý kus textu spolu s jeho ohraničujícím rámečkem. 
+ +Žádná magie, žádné skryté kroky—jen kompletní, spustitelný příklad, který můžete vložit do svého projektu. + +--- + +## Požadavky + +Než se ponoříme dál, ujistěte se, že máte nainstalováno následující: + +```bash +pip install aocr ai # hypothetical packages; replace with real ones if needed +``` + +Budete také potřebovat soubor s obrázkem (`input_image.png` nebo `.jpg`), který obsahuje jasný, čitelný text. Cokoliv od naskenované faktury po snímek obrazovky funguje, pokud OCR engine dokáže vidět znaky. + +## Krok 1: Inicializace OCR engine pro strukturované rozpoznávání + +Prvním krokem je vytvořit instanci `aocr.Engine()` a říct jí, že chceme **structured OCR recognition**. Strukturovaný režim vrací nejen čistý text, ale také geometrická data (ohraničující obdélníky) pro každou řádku, což je nezbytné, když potřebujete mapovat text zpět na obrázek. + +```python +import aocr +import ai # hypothetical post‑processing module + +# Initialise the OCR engine +ocr_engine = aocr.Engine() + +# Request structured recognition (text + geometry) +ocr_engine.recognize_mode = aocr.RecognitionMode.Structured +``` + +> **Proč je to důležité:** +> V výchozím režimu může engine poskytnout jen řetězec spojených slov. Strukturovaný režim vám dává hierarchii stránek → řádků → slov, každé s koordináty, což výrazně usnadňuje překrytí výsledků na původní obrázek nebo jejich předání modelu citlivému na rozvržení. + +## Krok 2: Spusťte OCR na obrázku a získejte surové výsledky + +Nyní načteme obrázek do engine. Volání `recognize` vrací objekt `OcrResult`, který obsahuje kolekci řádků, z nichž každý má své vlastní ohraničující obdélníky. + +```python +# Load your image (any format supported by aocr) +input_image_path = "input_image.png" + +# Run OCR – this returns an OcrResult with lines and bounds +raw_result = ocr_engine.recognize(input_image_path) +``` + +V tomto okamžiku `raw_result.lines` obsahuje objekty se dvěma důležitými atributy: + +- `text` – rozpoznaný řetězec pro tuto řádku. 
+- `bounds` – n-tici jako `(x, y, width, height)` popisující pozici řádky. + +## Krok 3: Post‑processing při zachování geometrie + +Surový výstup OCR je často šumivý: cizí znaky, špatně umístěné mezery nebo problémy s konci řádků. Funkce `ai.run_postprocessor` vyčistí text, ale **zachová původní geometrii** nedotčenou, takže stále máte přesné souřadnice. + +```python +# Apply a post‑processing step that corrects common OCR errors +postprocessed_result = ai.run_postprocessor(raw_result) + +# The structure (lines + bounds) stays the same, only `line.text` changes +``` + +> **Tip:** Pokud máte doménově specifické slovníky (např. kódy produktů), předávejte post‑processoru vlastní slovník pro zlepšení přesnosti. + +## Krok 4: Extrahujte text se souřadnicemi – iterujte a zobrazte + +Nakonec projdeme vyčištěné řádky a vytiskneme ohraničující rámeček každé řádky spolu s jejím textem. Toto je jádro **extract text with coordinates**. + +```python +# Print each recognised line together with its bounding box +for line in postprocessed_result.lines: + print(f"[{line.bounds}] {line.text}") +``` + +### Očekávaný výstup + +Předpokládejme, že vstupní obrázek obsahuje dvě řádky: “Invoice #12345” a “Total: $89.99”, uvidíte něco jako: + +``` +[(15, 30, 210, 25)] Invoice #12345 +[(15, 70, 190, 25)] Total: $89.99 +``` + +První n-tice je `(x, y, width, height)` řádky na původním obrázku, což vám umožní kreslit obdélníky, zvýraznit text nebo předat souřadnice do jiného systému. + +## Vizualizace výsledku (volitelné) + +Pokud chcete vidět ohraničující rámečky překryté na obrázku, můžete použít Pillow (PIL) k vykreslení obdélníků. Níže je rychlý úryvek; klidně jej přeskočte, pokud potřebujete jen surová data. 
+ +```python +from PIL import Image, ImageDraw + +# Open the original image +img = Image.open(input_image_path) +draw = ImageDraw.Draw(img) + +# Draw a rectangle around each line +for line in postprocessed_result.lines: + x, y, w, h = line.bounds + draw.rectangle([x, y, x + w, y + h], outline="red", width=2) + +# Save or show the annotated image +img.save("annotated_output.png") +img.show() +``` + +![příklad spuštění OCR na obrázku zobrazující ohraničující rámečky](/images/ocr-bounding-boxes.png "spuštění OCR na obrázku – překrytí ohraničujících rámečků") + +Výše uvedený alt text obsahuje **primary keyword**, což splňuje SEO požadavek na atributy alt obrázku. + +## Proč strukturované rozpoznávání OCR převažuje nad jednoduchým extrahováním textu + +Možná se ptáte: „Nemohu jen spustit OCR a získat text? Proč se trápit s geometrií?“ + +- **Prostorový kontext:** Když potřebujete mapovat pole na formuláři (např. „Date“ vedle hodnoty data), souřadnice vám říkají *kde* data jsou. +- **Více‑sloupcové rozvržení:** Jednoduchý lineární text ztrácí pořadí; strukturovaná data zachovávají pořadí sloupců. +- **Přesnost post‑processingu:** Znalost velikosti rámečku vám pomáhá rozhodnout, zda je slovo nadpis, poznámka pod čarou nebo cizí artefakt. + +Stručně řečeno, **structured OCR recognition** vám poskytuje flexibilitu pro tvorbu chytřejších pipeline—ať už vkládáte data do databáze, vytváříte prohledávatelné PDF nebo trénujete model strojového učení, který respektuje rozvržení. + +## Běžné okrajové případy a jak je řešit + +| Situace | Na co si dát pozor | Navrhované řešení | +|-----------|-------------------|---------------| +| **Otočené nebo zkosené obrázky** | Ohraničující rámečky mohou být mimo osu. | Předzpracujte deskewingem (např. `warpAffine` z OpenCV). | +| **Velmi malé fonty** | Engine může postrádat znaky, což vede k prázdným řádkům. | Zvyšte rozlišení obrázku nebo použijte `ocr_engine.set_dpi(300)`. 
| +| **Smíšené jazyky** | Špatný jazykový model může způsobit poškozený text. | Nastavte `ocr_engine.language = ["en", "de"]` před rozpoznáním. | +| **Překrývající se rámečky** | Post‑processor může neúmyslně sloučit dva řádky. | Ověřte `line.bounds` po zpracování; upravte prahy v `ai.run_postprocessor`. | + +Řešení těchto scénářů včas vám ušetří pozdější problémy, zejména když škálujete řešení na stovky dokumentů denně. + +## Kompletní skript od začátku do konce + +Níže je kompletní, připravený ke spuštění program, který spojuje všechny kroky. Zkopírujte, upravte cestu k obrázku a můžete spustit. + +```python +# -*- coding: utf-8 -*- +""" +Run OCR on image – extract text with coordinates using structured OCR recognition. +Author: Your Name +Date: 2026-05-03 +""" + +import aocr +import ai +from PIL import Image, ImageDraw + +def run_structured_ocr(image_path: str, annotate: bool = False): + # 1️⃣ Initialise the OCR engine + ocr_engine = aocr.Engine() + ocr_engine.recognize_mode = aocr.RecognitionMode.Structured + + # 2️⃣ Recognise the image + raw_result = ocr_engine.recognize(image_path) + + # 3️⃣ Post‑process while keeping geometry + processed = ai.run_postprocessor(raw_result) + + # 4️⃣ Print each line with its bounding box + for line in processed.lines: + print(f"[{line.bounds}] {line.text}") + + # Optional visualisation + if annotate: + img = Image.open(image_path) + draw = ImageDraw.Draw(img) + for line in processed.lines: + x, y, w, h = line.bounds + draw.rectangle([x, y, x + w, y + h], outline="red", width=2) + annotated_path = "annotated_" + image_path + img.save(annotated_path) + print(f"Annotated image saved as {annotated_path}") + +if __name__ == "__main__": + INPUT_IMG = "input_image.png" + run_structured_ocr(INPUT_IMG, annotate=True) +``` + +Spuštění tohoto skriptu provede: + +1. **Run OCR on image** ve strukturovaném režimu. +2. **Extract text with coordinates** pro každou řádku. +3. Volitelně vytvoří anotovaný PNG zobrazující rámečky. 
+ +## Závěr + +Nyní máte robustní, samostatné řešení pro **run OCR on image** a **extract text with coordinates** pomocí **structured OCR recognition**. Kód demonstruje každý krok—od inicializace engine po post‑processing a vizuální ověření—takže jej můžete přizpůsobit účtenkám, formulářům nebo jakémukoli vizuálnímu dokumentu, který potřebuje přesnou lokalizaci textu. + +Co dál? Zkuste vyměnit engine `aocr` za jinou knihovnu (Tesseract, EasyOCR) a podívejte se, jak se liší jejich strukturované výstupy. Experimentujte s různými strategiemi post‑processingu, jako je kontrola pravopisu nebo vlastní regex filtry, abyste zvýšili přesnost pro vaši doménu. A pokud budujete větší pipeline, zvažte uložení dvojic `(text, bounds)` do databáze pro pozdější analýzu. + +Šťastné kódování a ať jsou vaše OCR projekty vždy přesné! + +{{< /blocks/products/pf/tutorial-page-section >}} +{{< /blocks/products/pf/main-container >}} +{{< /blocks/products/pf/main-wrap-class >}} +{{< blocks/products/products-backtop-button >}} \ No newline at end of file diff --git a/ocr/dutch/python/general/extract-text-from-image-ocr-with-aspose-ai-spell-check/_index.md b/ocr/dutch/python/general/extract-text-from-image-ocr-with-aspose-ai-spell-check/_index.md new file mode 100644 index 000000000..678badcaa --- /dev/null +++ b/ocr/dutch/python/general/extract-text-from-image-ocr-with-aspose-ai-spell-check/_index.md @@ -0,0 +1,212 @@ +--- +category: general +date: 2026-05-03 +description: tekst extraheren uit afbeelding met Aspose OCR en AI-spellingscontrole. + Leer hoe je een afbeelding OCR't, afbeelding laadt voor OCR, tekst van factuur herkent + en GPU‑resources vrijgeeft. +draft: false +keywords: +- extract text from image +- how to ocr image +- load image for ocr +- release gpu resources +- recognize text from invoice +language: nl +og_description: tekst uit afbeelding extraheren met Aspose OCR en AI-spellingscontrole. 
+ Stapsgewijze handleiding die behandelt hoe je een afbeelding OCR't, afbeelding laadt + voor OCR, en GPU‑resources vrijgeeft. +og_title: tekst uit afbeelding extraheren – Complete OCR‑ en spellingscontrolegids +tags: +- OCR +- Aspose +- AI +- Python +title: tekst uit afbeelding extraheren – OCR met Aspose AI Spell‑Check +url: /nl/python/general/extract-text-from-image-ocr-with-aspose-ai-spell-check/ +--- + +{{< blocks/products/pf/main-wrap-class >}} +{{< blocks/products/pf/main-container >}} +{{< blocks/products/pf/tutorial-page-section >}} + +# tekst uit afbeelding extraheren – Complete OCR- en spell‑checkgids + +Heb je ooit **tekst uit afbeelding** moeten extraheren maar wist je niet welke bibliotheek zowel snelheid als nauwkeurigheid biedt? Je bent niet de enige. In veel real‑world projecten—denk aan factuurverwerking, kassabon digitalisatie, of het scannen van contracten—het verkrijgen van schone, doorzoekbare tekst uit een foto is de eerste hindernis. + +Het goede nieuws is dat Aspose OCR in combinatie met een lichtgewicht Aspose AI‑model die taak kan afhandelen in een paar regels Python. In deze tutorial lopen we door **hoe je een afbeelding OCR't**, de afbeelding correct laadt, een ingebouwde spell‑check‑post‑processor uitvoert, en uiteindelijk **GPU‑bronnen vrijgeeft** zodat je app geheugen‑vriendelijk blijft. + +Aan het einde van deze gids kun je **tekst van facturen** herkennen, veelvoorkomende OCR‑fouten automatisch corrigeren, en je GPU schoon houden voor de volgende batch. + +--- + +## Wat je nodig hebt + +- Python 3.9 of nieuwer (de code gebruikt type‑hints maar werkt op eerdere 3.x‑versies) +- `aspose-ocr` en `aspose-ai` pakketten (installeren via `pip install aspose-ocr aspose-ai`) +- Een CUDA‑enabled GPU is optioneel; het script valt terug op CPU als er geen wordt gevonden. +- Een voorbeeldafbeelding, bijv. `sample_invoice.png`, geplaatst in een map die je kunt refereren. 
+ +Geen zware ML‑frameworks, geen enorme modeldownloads—alleen een klein Q4‑K‑M gekwantiseerd model dat comfortabel op de meeste GPU's past. + +## Stap 1: Initialiseer de OCR‑engine – tekst uit afbeelding extraheren + +Het eerste wat je doet is een `OcrEngine`‑instantie maken en aangeven welke taal je verwacht. Hier kiezen we Engels en vragen om plain‑text output, wat ideaal is voor downstream verwerking. + +```python +import aocr # Aspose OCR package +import aspose.ai as ai # Aspose AI package + +# Initialise the OCR engine +ocr_engine = aocr.OcrEngine() +ocr_engine.language = aocr.Language.English # Choose any supported language +ocr_engine.recognize_mode = aocr.RecognitionMode.Plain # Plain text makes post‑processing easier +``` + +**Waarom dit belangrijk is:** Het instellen van de taal beperkt de tekenset, waardoor de nauwkeurigheid verbetert. De plain‑text modus verwijdert lay‑outinformatie die je meestal niet nodig hebt wanneer je alleen tekst uit afbeelding wilt extraheren. + +## Stap 2: Afbeelding laden voor OCR – hoe een afbeelding OCR't + +Nu voeren we een echte afbeelding aan de engine. De `Image.load`‑helper begrijpt gangbare formaten (PNG, JPEG, TIFF) en abstraheert de eigenaardigheden van bestands‑IO. + +```python +# Load the input image – this is the "load image for OCR" step +input_image = aocr.Image.load("YOUR_DIRECTORY/sample_invoice.png") +raw_text = ocr_engine.recognize(input_image) # Returns the recognised text as a string +``` + +**Tip:** Als je bronafbeeldingen groot zijn, overweeg ze te verkleinen voordat je ze naar de engine stuurt; kleinere afmetingen kunnen GPU‑geheugengebruik verminderen zonder de herkenningskwaliteit te schaden. + +## Stap 3: Configureer het Aspose AI‑model – tekst van factuur herkennen + +Aspose AI wordt geleverd met een klein GGUF‑model dat je automatisch kunt downloaden. Het voorbeeld gebruikt de `Qwen2.5‑3B‑Instruct‑GGUF`‑repository, gekwantiseerd naar `q4_k_m`. 
We geven de runtime ook de opdracht om 20 lagen op de GPU toe te wijzen, wat snelheid en VRAM‑gebruik in balans brengt. + +```python +# Model configuration – auto‑download a small Q4‑K‑M quantised model +model_config = ai.AsposeAIModelConfig() +model_config.allow_auto_download = "true" +model_config.hugging_face_repo_id = "bartowski/Qwen2.5-3B-Instruct-GGUF" +model_config.hugging_face_quantization = "q4_k_m" +model_config.gpu_layers = 20 # Use 20 GPU layers when a GPU is available +``` + +**Achter de schermen:** Het gekwantiseerde model is ongeveer 1,5 GB op schijf, een fractie van een full‑precision model, maar het vangt toch genoeg linguïstische nuances om typische OCR‑spelfouten te signaleren. + +## Stap 4: Initialiseer AsposeAI en koppel de spell‑check post‑processor + +Aspose AI bevat een kant‑en‑klare spell‑check post‑processor. Door deze te koppelen wordt elk OCR‑resultaat automatisch opgeschoond. + +```python +# Initialise AsposeAI and attach the built‑in spell‑check post‑processor +ocr_ai = ai.AsposeAI(model_config) # Pass the config we just built +ocr_ai.set_post_processor(ocr_ai.postprocessor_spell_check, {}) # Empty dict → default settings +``` + +**Waarom de post‑processor gebruiken?** OCR‑engines lezen vaak “Invoice” als “Invo1ce” of “Total” als “T0tal”. De spell‑check voert een lichtgewicht taalmodel uit over de ruwe string en corrigeert die fouten zonder dat je een aangepast woordenboek hoeft te schrijven. + +## Stap 5: Voer de spell‑check post‑processor uit op het OCR‑resultaat + +Met alles aangesloten levert één enkele aanroep de gecorrigeerde tekst op. We printen ook zowel de originele als de opgeschoonde versie zodat je de verbetering kunt zien. 
+ +```python +# Run the spell‑check post‑processor on the OCR result +corrected_text = ocr_ai.run_postprocessor(raw_text) + +print("Original :", raw_text) +print("Corrected:", corrected_text) +``` + +Typische output voor een factuur kan er als volgt uitzien: + +``` +Original : Invo1ce #12345 +Date: 2023/07/15 +Total: $1,250.00 +... +Corrected: Invoice #12345 +Date: 2023/07/15 +Total: $1,250.00 +... +``` + +Let op hoe “Invo1ce” is omgezet naar het juiste woord “Invoice”. Dat is de kracht van de ingebouwde AI‑spell‑check. + +## Stap 6: GPU‑bronnen vrijgeven – GPU‑bronnen veilig vrijgeven + +Als je dit draait in een langdurige service (bijv. een web‑API die tientallen facturen per minuut verwerkt), moet je de GPU‑context na elke batch vrijgeven. Anders zie je geheugenlekken en krijg je uiteindelijk “CUDA out of memory” fouten. + +```python +# Release GPU resources – crucial to avoid memory leaks +ocr_ai.free_resources() +``` + +**Pro tip:** Roep `free_resources()` aan binnen een `finally`‑blok of een context‑manager zodat het altijd wordt uitgevoerd, zelfs als er een uitzondering optreedt. + +## Volledig werkend voorbeeld + +Alle onderdelen samenvoegen levert een zelfstandige script op die je in elk project kunt gebruiken. 
def main():
    """End-to-end demo: OCR an invoice image, spell-check the result, clean up."""
    # --- OCR engine setup: English language, plain-text output ---
    engine = aocr.OcrEngine()
    engine.language = aocr.Language.English
    engine.recognize_mode = aocr.RecognitionMode.Plain

    # --- Recognise the raw text from the sample image ---
    raw_text = engine.recognize(aocr.Image.load("YOUR_DIRECTORY/sample_invoice.png"))

    # --- Small quantised AI model used for spell-checking ---
    cfg = ai.AsposeAIModelConfig()
    cfg.allow_auto_download = "true"
    cfg.hugging_face_repo_id = "bartowski/Qwen2.5-3B-Instruct-GGUF"
    cfg.hugging_face_quantization = "q4_k_m"
    cfg.gpu_layers = 20

    # --- Attach the built-in spell-check post-processor and run it ---
    spell_checker = ai.AsposeAI(cfg)
    spell_checker.set_post_processor(spell_checker.postprocessor_spell_check, {})
    corrected = spell_checker.run_postprocessor(raw_text)

    print("Original :", raw_text)
    print("Corrected:", corrected)

    # --- Release GPU/model memory ---
    spell_checker.free_resources()
Vergeet niet `ocr_ai.free_resources()` aan te roepen na de lus of na elke batch als je dezelfde AI‑instantie hergebruikt. + +**Q: Hoe groot is de model‑download?** +A: Ongeveer 1,5 GB voor de gekwantiseerde `q4_k_m` versie. Het wordt gecached na de eerste uitvoering, zodat volgende runs direct zijn. + +## Conclusie + +In deze tutorial hebben we laten zien hoe je **tekst uit afbeelding** kunt extraheren met Aspose OCR, een klein AI‑model configureert, een spell‑check post‑processor toepast, en veilig **GPU‑bronnen vrijgeeft**. De workflow omvat alles van het laden van de afbeelding tot het opruimen, waardoor je een betrouwbare pijplijn krijgt voor **tekst van factuur herkennen** scenario's. + +Volgende stappen? Probeer de spell‑check te vervangen door een aangepast entity‑extractie model + +{{< /blocks/products/pf/tutorial-page-section >}} +{{< /blocks/products/pf/main-container >}} +{{< /blocks/products/pf/main-wrap-class >}} +{{< blocks/products/products-backtop-button >}} \ No newline at end of file diff --git a/ocr/dutch/python/general/how-to-batch-ocr-with-aspose-ocr-full-python-guide/_index.md b/ocr/dutch/python/general/how-to-batch-ocr-with-aspose-ocr-full-python-guide/_index.md new file mode 100644 index 000000000..4999dbcdd --- /dev/null +++ b/ocr/dutch/python/general/how-to-batch-ocr-with-aspose-ocr-full-python-guide/_index.md @@ -0,0 +1,215 @@ +--- +category: general +date: 2026-05-03 +description: Hoe afbeeldingen in batch OCR’en met Aspose OCR en AI‑spellingscontrole. + Leer tekst uit afbeeldingen te extraheren, spellingscontrole toe te passen, AI‑resources + gratis te gebruiken en OCR‑fouten te corrigeren. +draft: false +keywords: +- how to batch ocr +- extract text from images +- free ai resources +- apply spell check +- correct ocr errors +language: nl +og_description: Hoe batch OCR‑afbeeldingen uit te voeren met Aspose OCR en AI‑spellingcontrole. 
+ Volg een stapsgewijze handleiding om tekst uit afbeeldingen te extraheren, spellingcontrole + toe te passen, gratis AI‑bronnen te gebruiken en OCR‑fouten te corrigeren. +og_title: Hoe batch-OCR te doen met Aspose OCR – Complete Python‑tutorial +tags: +- OCR +- Python +- AI +- Aspose +title: Hoe batch‑OCR uit te voeren met Aspose OCR – Volledige Python‑gids +url: /nl/python/general/how-to-batch-ocr-with-aspose-ocr-full-python-guide/ +--- + +{{< blocks/products/pf/main-wrap-class >}} +{{< blocks/products/pf/main-container >}} +{{< blocks/products/pf/tutorial-page-section >}} + +# Hoe batch OCR uit te voeren met Aspose OCR – Volledige Python-gids + +Heb je je ooit afgevraagd **hoe je batch OCR** kunt toepassen op een hele map met gescande PDF‑s of foto‑s zonder voor elk bestand een apart script te schrijven? Je bent niet de enige. In veel real‑world pipelines moet je **tekst uit afbeeldingen extraheren**, spelfouten opschonen en tenslotte alle AI‑bronnen die je hebt toegewezen vrijgeven. Deze tutorial laat je precies zien hoe je dat doet met Aspose OCR, een lichte AI‑post‑processor, en een paar regels Python. + +We lopen stap voor stap door het initialiseren van de OCR‑engine, het koppelen van een AI‑spell‑checker, het itereren over een map met afbeeldingen, en het opruimen van het model daarna. Aan het einde heb je een kant‑klaar script dat **OCR‑fouten corrigeert** automatisch en **AI‑bronnen vrijgeeft** zodat je GPU tevreden blijft. + +## Wat je nodig hebt + +- Python 3.9+ (de code gebruikt type‑hints maar werkt ook op eerdere 3.x‑versies) +- `asposeocr`‑package (`pip install asposeocr`) – dit levert de OCR‑engine. +- Toegang tot het Hugging Face‑model `bartowski/Qwen2.5-3B-Instruct-GGUF` (automatisch gedownload). +- Een GPU met ten minste een paar GB VRAM (het script zet `gpu_layers = 30`, je kunt dit verlagen indien nodig). + +Geen externe services, geen betaalde API’s – alles draait lokaal. 
# Step 1: Build an OCR engine configured for English plain-text output
def init_ocr() -> aocr.OcrEngine:
    """Create and return an OCR engine set up for plain English text."""
    engine = aocr.OcrEngine()
    # English keeps the character set small, which improves accuracy.
    engine.language = aocr.Language.English
    # Plain mode returns a raw string without layout metadata.
    engine.recognize_mode = aocr.RecognitionMode.Plain
    return engine
# Step 2: Configure and start the AI post-processor
def init_ai() -> aocr.ai.AsposeAI:
    """Start the AI post-processor with a quantised Qwen model and spell-check attached."""
    cfg = AsposeAIModelConfig()
    cfg.allow_auto_download = "true"          # fetch the model on first use
    cfg.hugging_face_repo_id = "bartowski/Qwen2.5-3B-Instruct-GGUF"
    cfg.hugging_face_quantization = "q4_k_m"  # small quantised build
    cfg.gpu_layers = 30                       # lower this on GPUs with little VRAM

    processor = AsposeAI()
    processor.initialize(cfg)
    # Every string fed to the processor is now spell-checked automatically.
    processor.set_post_processor(processor.postprocessor_spell_check, {})
    return processor


# Step 3: OCR an image and run the spell-check post-processor
def ocr_and_correct(image_path: str,
                    ocr_engine: aocr.OcrEngine,
                    ai_processor: aocr.ai.AsposeAI) -> str:
    """OCR one image and return its AI-corrected text."""
    loaded = aocr.Image.load(image_path)      # load any supported format
    return ai_processor.run_postprocessor(ocr_engine.recognize(loaded))
# Step 4: Process an entire folder of images
if __name__ == "__main__":
    # Build the engine and the AI post-processor once, reuse them per file.
    engine = init_ocr()
    processor = init_ai()

    input_dir = "YOUR_DIRECTORY/input_images"
    output_dir = "YOUR_DIRECTORY/output_text"
    os.makedirs(output_dir, exist_ok=True)

    image_extensions = ('.png', '.jpg', '.jpeg', '.tif', '.tiff')
    for entry in os.listdir(input_dir):
        # Only handle common image extensions; skip PDFs, hidden files, etc.
        if not entry.lower().endswith(image_extensions):
            continue

        corrected = ocr_and_correct(os.path.join(input_dir, entry),
                                    engine, processor)

        stem, _ = os.path.splitext(entry)
        with open(os.path.join(output_dir, stem + ".txt"),
                  "w", encoding="utf-8") as out:
            out.write(corrected)

        print(f"Processed {entry}")

    # Step 5: Release AI resources after the batch finishes
    processor.free_resources()
+ +--- + +## Stap 5: Opruimen – **AI‑bronnen vrijgeven** op de juiste manier + +Wanneer je klaar bent, zorgt `free_resources()` ervoor dat GPU‑geheugen, CUDA‑contexten en eventuele tijdelijke bestanden die het model heeft aangemaakt, worden vrijgegeven. + +```python +# Step 5: Explicitly free GPU and model memory +ai_processor.free_resources() +``` + +Het overslaan van deze stap kan leiden tot hangende GPU‑allocaties, wat latere Python‑processen kan laten crashen of VRAM kan opslokken. Beschouw het als het “licht uitdoen” deel van een batch‑taak. + +--- + +## Veelvoorkomende valkuilen & extra tips + +| Probleem | Waar op letten | Oplossing | +|----------|----------------|-----------| +| **Out‑of‑memory‑fouten** | GPU raakt vol na enkele tientallen afbeeldingen | Verlaag `gpu_layers` of schakel over naar CPU (`model_cfg.gpu_layers = 0`). | +| **Ontbrekend taalpakket** | OCR geeft lege strings terug | Zorg dat de `asposeocr`‑versie Engels taaldata bevat; herinstalleer indien nodig. | +| **Niet‑beeldbestanden** | Script crasht bij een verdwaalde `.pdf` | De `if not file_name.lower().endswith(...)`‑guard slaat deze al over. | +| **Spell‑check niet toegepast** | Output lijkt identiek aan ruwe OCR | Controleer of `ai_processor.set_post_processor` vóór de lus is aangeroepen. | +| **Trage batch‑snelheid** | Duurt >5 seconden per afbeelding | Zet `model_cfg.allow_auto_download = "false"` na de eerste run, zodat het model niet telkens opnieuw wordt gedownload. | + +**Pro tip:** Als je **tekst uit afbeeldingen** wilt extraheren in een andere taal dan Engels, wijzig dan eenvoudig `ocr_engine.language` naar de juiste enum (bijv. `aocr.Language.French`). Dezelfde AI‑post‑processor past nog steeds spell‑check toe, maar je wilt wellicht een taalspecifiek model voor optimale resultaten. + +--- + +## Samenvatting & vervolgstappen + +We hebben de volledige pipeline behandeld voor **hoe batch OCR**: + +1. **Initialiseer** een plain‑text OCR‑engine voor Engels. +2. 
**Configureer** een AI‑spell‑check‑model en bind het als post‑processor. +3. **Voer** OCR uit op elke afbeelding en laat de AI **OCR‑fouten corrigeren** automatisch. +4. **Loop** over een map om **tekst uit afbeeldingen** in bulk te **extraheren**. +5. **Vrijgeven** van AI‑bronnen zodra de taak klaar is. + +Vanaf hier kun je: + +- De gecorrigeerde tekst doorsturen naar een downstream NLP‑pipeline (sentiment‑analyse, entiteitsextractie, enz.). +- De spell‑check‑post‑processor vervangen door een aangepaste samenvatter via `ai_processor.set_post_processor(your_custom_func, {})`. +- De map‑lus paralleliseren met `concurrent.futures.ThreadPoolExecutor` als je GPU meerdere streams aankan. + +--- + +## Slotgedachten + +Batch‑OCR hoeft geen zware klus te zijn. Door Aspose OCR te combineren met een lichtgewicht AI‑model krijg je een **alles‑in‑één oplossing** die **tekst uit afbeeldingen** **extraheren**, **spell‑check toepassen**, **OCR‑fouten corrigeren**, en **AI‑bronnen netjes vrijgeven**. Probeer het script op een testmap, pas het aantal GPU‑lagen aan op jouw hardware, en je hebt binnen enkele minuten een productie‑klare pipeline. + +Heb je vragen over het aanpassen van het model, het verwerken van PDF‑s, of het integreren in een webservice? Laat een reactie achter of ping me op GitHub. Veel programmeerplezier, en moge je OCR altijd accuraat zijn! 
+ +{{< /blocks/products/pf/tutorial-page-section >}} +{{< /blocks/products/pf/main-container >}} +{{< /blocks/products/pf/main-wrap-class >}} +{{< blocks/products/products-backtop-button >}} \ No newline at end of file diff --git a/ocr/dutch/python/general/python-ocr-tutorial-batch-ocr-processing-made-easy/_index.md b/ocr/dutch/python/general/python-ocr-tutorial-batch-ocr-processing-made-easy/_index.md new file mode 100644 index 000000000..88396a8ae --- /dev/null +++ b/ocr/dutch/python/general/python-ocr-tutorial-batch-ocr-processing-made-easy/_index.md @@ -0,0 +1,300 @@ +--- +category: general +date: 2026-05-03 +description: Python OCR-tutorial die laat zien hoe je PNG‑afbeeldingsbestanden laadt, + tekst uit een afbeelding herkent en gratis AI‑bronnen voor batch‑OCR‑verwerking + biedt. +draft: false +keywords: +- python ocr tutorial +- batch ocr processing +- free ai resources +- load png image +- recognize text from image +language: nl +og_description: Python OCR‑tutorial leidt je door het laden van PNG‑afbeeldingen, + het herkennen van tekst uit een afbeelding en het omgaan met gratis AI‑bronnen voor + batch‑OCR‑verwerking. +og_title: Python OCR-tutorial – Snelle batch-OCR met gratis AI-bronnen +tags: +- OCR +- Python +- AI +title: Python OCR-tutorial – Batch-OCR-verwerking eenvoudig gemaakt +url: /nl/python/general/python-ocr-tutorial-batch-ocr-processing-made-easy/ +--- + +{{< blocks/products/pf/main-wrap-class >}} +{{< blocks/products/pf/main-container >}} +{{< blocks/products/pf/tutorial-page-section >}} + +# Python OCR Tutorial – Batch OCR Processing Made Easy + +Heb je ooit een **python ocr tutorial** nodig gehad die je echt in staat stelt om OCR uit te voeren op tientallen PNG‑bestanden zonder je haar uit te trekken? Je bent niet de enige. In veel real‑world projecten moet je **load png image**‑bestanden laden, ze aan een engine voeren, en daarna de AI‑bronnen opruimen wanneer je klaar bent. 
def init_engine(config_path: str = "config.yaml"):
    """
    Set up the AI engine exactly once; later calls are no-ops that just report it.

    The allocation is released later via the companion cleanup step.
    """
    if ai.is_initialized():
        print("Engine already initialized.")
        return
    # One-time allocation with the supplied configuration.
    ai.initialize(config_path)
# step_2 / step_3 / step_4 helpers: gather PNG paths, OCR + post-process, clean up.

def collect_png_paths(directory: str) -> List[Path]:
    """
    Scan `directory` and return a sorted list of Path objects pointing to PNG files.

    Raises:
        NotADirectoryError: if `directory` is not an existing folder.
        FileNotFoundError: if the folder contains no PNG images.
    """
    base_path = Path(directory)
    if not base_path.is_dir():
        raise NotADirectoryError(f"'{directory}' is not a valid folder.")

    png_files = sorted(base_path.glob("*.png"))
    if not png_files:
        raise FileNotFoundError("No PNG images found in the specified directory.")

    print(f"Found {len(png_files)} PNG image(s) to process.")
    return png_files


def ocr_batch(image_paths: List[Path], engine=None) -> List[str]:
    """
    Perform OCR on each PNG image and return a list of cleaned strings.

    Args:
        image_paths: PNG files to run through OCR.
        engine: object exposing ``recognize(image)``. Bug fix: the original
            body referenced an undefined global ``engine``; callers may now
            pass one explicitly, and we fall back to ``aocr.engine`` when the
            package provides a module-level default.

    Raises:
        RuntimeError: if no engine was supplied and ``aocr`` exposes none.
    """
    if engine is None:
        engine = getattr(aocr, "engine", None)
    if engine is None:
        raise RuntimeError("No OCR engine available: pass `engine=` explicitly.")

    results = []
    for image_path in image_paths:
        # aocr.Image.load abstracts away Pillow/OpenCV details.
        img = aocr.Image.load(str(image_path))
        raw_text = engine.recognize(img)
        # Refine the raw OCR output using the AI post-processor.
        cleaned_text = ai.run_postprocessor(raw_text)
        results.append(cleaned_text)

        print(f"Processed {image_path.name}: {len(cleaned_text)} characters extracted.")

    return results


def release_resources():
    """
    Free any allocated AI resources. Safe to call multiple times.
    """
    if ai.is_initialized():
        ai.free_resources()
        print("AI resources have been released.")
    else:
        print("No AI resources were allocated.")
+""" + +import sys +from pathlib import Path +import ai +import aocr + +# ---------------------------------------------------------------------- +# Helper functions (copied from the steps above) +# ---------------------------------------------------------------------- +def init_engine(cfg: str = "config.yaml"): + if not ai.is_initialized(): + ai.initialize(cfg) + else: + print("Engine already initialized.") + +def collect_png_paths(directory: str): + base_path = Path(directory) + if not base_path.is_dir(): + raise NotADirectoryError(f"'{directory}' is not a valid folder.") + png_files = sorted(base_path.glob("*.png")) + if not png_files: + raise FileNotFoundError("No PNG images found in the specified directory.") + print(f"Found {len(png_files)} PNG image(s) to process.") + return png_files + +def ocr_batch(image_paths): + results = [] + for image_path in image_paths: + img = aocr.Image.load(str(image_path)) + raw_text = engine.recognize(img) + cleaned_text = ai.run_postprocessor(raw_text) + results.append(cleaned_text) + print(f"Processed {image_path.name}: {len(cleaned_text)} characters.") + return results + +def release_resources(): + if ai.is_initialized(): + ai.free_resources() + print("AI resources released.") + else: + print("No resources to release.") + +# ---------------------------------------------------------------------- +# Main execution block +# ---------------------------------------------------------------------- +if __name__ == "__main__": + if len(sys.argv) != 2: + print("Usage: python batch_ocr.py ") + sys.exit(1) + + image_dir = sys.argv[1] + + try: + init_engine() + png_paths = collect_png_paths(image_dir) + texts = ocr_batch(png_paths) + + # Optional: write results to a single text file + output_file = Path("ocr_results.txt") + with output_file.open("w", encoding="utf-8") as f: + for path, txt in zip(png_paths, texts): + f.write(f"--- {path.name} ---\n") + f.write(txt + "\n\n") + print(f"All results saved to {output_file.resolve()}") + 
finally: + release_resources() +``` + +### Expected Output + +Het uitvoeren van het script tegen een map met drie PNG‑bestanden kan bijvoorbeeld het volgende afdrukken: + +``` +Engine already initialized. +Found 3 PNG image(s) to process. +Processed invoice1.png: 452 characters. +Processed receipt2.png: 317 characters. +Processed flyer3.png: 689 characters. +All results saved to /home/user/ocr_results.txt +AI resources released. +``` + +Het bestand `ocr_results.txt` zal een duidelijke scheiding bevatten voor elke afbeelding, gevolgd door de opgeschoonde OCR‑tekst. + +## Optional Stubs for aocr & ai (If You Don’t Have Real Packages) + +Als je alleen de flow wilt testen zonder zware OCR‑bibliotheken te gebruiken, kun je minimale mock‑modules maken: + +```python +# aocr/__init__.py +class Image: + @staticmethod + def load(path): + return f"ImageObject({path})" + +def dummy_recognize(image): + return "Raw OCR output for " + str(image) + +engine = type("Engine", (), {"recognize": dummy_recognize})() +``` + +```python +# ai/__init__.py +_state = {"initialized": False} + +def is_initialized(): + return _state["initialized"] + +def initialize(cfg): + print(f"Initializing AI engine with {cfg}") + _state["initialized"] = True + +def run_postprocessor(text): + # Very naive cleanup: strip extra spaces + return " ".join(text.split()) + +def free_resources(): + print("Freeing AI resources") + _state["initialized"] = False +``` + +Plaats deze mappen naast `batch_ocr.py` en het script zal draaien, waarbij mock‑resultaten worden afgedrukt. + +## Pro Tips & Common Pitfalls + +- **Memory spikes:** Als je duizenden hoge‑resolutie PNG‑s verwerkt, overweeg dan om ze vóór OCR te verkleinen. `aocr.Image.load` accepteert vaak een `max_size`‑argument. +- **Unicode handling:** Open altijd het output‑bestand met `encoding="utf-8"`; OCR‑engines kunnen niet‑ASCII‑tekens produceren. 
+- **Parallelism:** Voor CPU‑gebonden OCR kun je `ocr_batch` wikkelen in een `concurrent.futures.ThreadPoolExecutor`. Zorg er wel voor dat er slechts één `ai`‑instantie bestaat – het spawnen van veel threads die elk `ai.initialize` aanroepen ondermijnt het “free AI resources”‑doel. +- **Error resilience:** Plaats de per‑afbeelding‑lus in een `try/except`‑blok zodat één corrupte PNG niet de hele batch stopt. + +## Conclusion + +Je hebt nu een **python ocr tutorial** die laat zien hoe je **load png image**‑bestanden verwerkt, **batch OCR processing** uitvoert, en verantwoord **free AI resources** beheert. Het complete, uitvoerbare voorbeeld toont precies hoe je **recognize text from image**‑objecten herkent en daarna opruimt, zodat je het kunt kopiëren‑plakken in je eigen projecten zonder te zoeken naar ontbrekende onderdelen. + +Klaar voor de volgende stap? Probeer de gestubde `aocr`‑ en `ai`‑modules te vervangen door echte bibliotheken zoals `pytesseract` en `torchvision`. Je kunt het script ook uitbreiden om JSON uit te voeren, resultaten naar een database te pushen, of te integreren met een cloud‑storage bucket. De mogelijkheden zijn eindeloos — happy coding! + +{{< /blocks/products/pf/tutorial-page-section >}} +{{< /blocks/products/pf/main-container >}} +{{< /blocks/products/pf/main-wrap-class >}} +{{< blocks/products/products-backtop-button >}} \ No newline at end of file diff --git a/ocr/dutch/python/general/run-ocr-on-image-complete-guide-to-structured-text-extractio/_index.md b/ocr/dutch/python/general/run-ocr-on-image-complete-guide-to-structured-text-extractio/_index.md new file mode 100644 index 000000000..979b310b3 --- /dev/null +++ b/ocr/dutch/python/general/run-ocr-on-image-complete-guide-to-structured-text-extractio/_index.md @@ -0,0 +1,254 @@ +--- +category: general +date: 2026-05-03 +description: Leer hoe je OCR op een afbeelding uitvoert en tekst met coördinaten extraheert + met gestructureerde OCR‑herkenning. Stap‑voor‑stap Python‑code inbegrepen. 
+draft: false +keywords: +- run OCR on image +- extract text with coordinates +- structured OCR recognition +- OCR post‑processing +- bounding box extraction +- image text detection +language: nl +og_description: Voer OCR uit op een afbeelding en krijg tekst met coördinaten via + gestructureerde OCR-herkenning. Volledig Python‑voorbeeld met uitleg. +og_title: OCR uitvoeren op afbeelding – Tutorial voor gestructureerde tekstelextractie +tags: +- OCR +- Python +- Computer Vision +title: OCR uitvoeren op afbeelding – Complete gids voor gestructureerde tekstextractie +url: /nl/python/general/run-ocr-on-image-complete-guide-to-structured-text-extractio/ +--- + +{{< blocks/products/pf/main-wrap-class >}} +{{< blocks/products/pf/main-container >}} +{{< blocks/products/pf/tutorial-page-section >}} + +# OCR uitvoeren op afbeelding – Complete gids voor gestructureerde teksteextractie + +Heb je ooit **run OCR on image** bestanden nodig gehad, maar wist je niet hoe je de exacte posities van elk woord kunt behouden? Je bent niet de enige. In veel projecten—bonnen scannen, formulieren digitaliseren of UI-testen—heb je niet alleen de ruwe tekst nodig, maar ook de begrenzingskaders die aangeven waar elke regel zich op de afbeelding bevindt. + +Deze tutorial laat je een praktische manier zien om *run OCR on image* te doen met de **aocr** engine, **structured OCR recognition** aan te vragen, en vervolgens het resultaat post‑processen terwijl je de geometrie behoudt. Aan het einde kun je **extract text with coordinates** met slechts een paar regels Python, en begrijp je waarom de gestructureerde modus belangrijk is voor vervolgprocessen. + +## What You’ll Learn + +- Hoe je de OCR-engine initialiseert voor **structured OCR recognition**. +- Hoe je een afbeelding invoert en ruwe resultaten ontvangt die lijnbegrenzingen bevatten. +- Hoe je een post‑processor uitvoert die de tekst opschoont zonder de geometrie te verliezen. 
+- Hoe je over de uiteindelijke regels itereert en elk stuk tekst samen met het begrenzingskader afdrukt.
+
+Geen magie, geen verborgen stappen—gewoon een compleet, uitvoerbaar voorbeeld dat je in je eigen project kunt gebruiken.
+
+---
+
+## Prerequisites
+
+Voordat we beginnen, zorg ervoor dat je het volgende geïnstalleerd hebt:
+
+```bash
+pip install aocr ai # hypothetical packages; replace with real ones if needed
+```
+
+Je hebt ook een afbeeldingsbestand (`input_image.png` of `.jpg`) nodig dat duidelijke, leesbare tekst bevat. Alles van een gescande factuur tot een screenshot werkt, zolang de OCR-engine de tekens kan zien.
+
+---
+
+## Step 1: Initialise the OCR engine for structured recognition
+
+Het eerste wat we doen is een instantie van `aocr.Engine()` maken en aangeven dat we **structured OCR recognition** willen. Structured mode geeft niet alleen de platte tekst terug, maar ook geometrische gegevens (bounding rectangles) voor elke regel, wat essentieel is wanneer je tekst terug op de afbeelding wilt plaatsen.
+
+```python
+import aocr
+import ai # hypothetical post‑processing module
+
+# Initialise the OCR engine
+ocr_engine = aocr.Engine()
+
+# Request structured recognition (text + geometry)
+ocr_engine.recognize_mode = aocr.RecognitionMode.Structured
+```
+
+> **Why this matters:**
+> In de standaardmodus geeft de engine misschien alleen een tekenreeks van aaneengeschakelde woorden. Structured mode geeft je een hiërarchie van pagina's → regels → woorden, elk met coördinaten, waardoor het veel eenvoudiger wordt om resultaten over de originele afbeelding te leggen of ze aan een lay-outbewust model door te geven.
+
+---
+
+## Step 2: Run OCR on the image and obtain raw results
+
+Nu voeren we de afbeelding in de engine. De `recognize`‑aanroep retourneert een `OcrResult`‑object dat een verzameling regels bevat, elk met zijn eigen bounding rectangle.
+ +```python +# Load your image (any format supported by aocr) +input_image_path = "input_image.png" + +# Run OCR – this returns an OcrResult with lines and bounds +raw_result = ocr_engine.recognize(input_image_path) +``` + +Op dit punt bevat `raw_result.lines` objecten met twee belangrijke attributen: + +- `text` – de herkende string voor die regel. +- `bounds` – een tuple zoals `(x, y, width, height)` die de positie van de regel beschrijft. + +--- + +## Step 3: Post‑process while preserving geometry + +Ruwe OCR‑output is vaak ruisachtig: vreemde tekens, verkeerd geplaatste spaties of regelafbrekingsproblemen. De `ai.run_postprocessor`‑functie maakt de tekst schoon maar **keeps the original geometry** intact, zodat je nog steeds nauwkeurige coördinaten hebt. + +```python +# Apply a post‑processing step that corrects common OCR errors +postprocessed_result = ai.run_postprocessor(raw_result) + +# The structure (lines + bounds) stays the same, only `line.text` changes +``` + +> **Pro tip:** Als je domeinspecifieke vocabularies hebt (bijv. product codes), voer dan een aangepast woordenboek in de post‑processor in om de nauwkeurigheid te verbeteren. + +--- + +## Step 4: Extract text with coordinates – iterate and display + +Tot slot lopen we door de opgeschoonde regels, waarbij we de bounding box van elke regel naast de tekst afdrukken. Dit is de kern van **extract text with coordinates**. + +```python +# Print each recognised line together with its bounding box +for line in postprocessed_result.lines: + print(f"[{line.bounds}] {line.text}") +``` + +### Expected Output + +Als we aannemen dat de invoerafbeelding twee regels bevat: “Invoice #12345” en “Total: $89.99”, zie je iets als: + +``` +[(15, 30, 210, 25)] Invoice #12345 +[(15, 70, 190, 25)] Total: $89.99 +``` + +De eerste tuple is de `(x, y, width, height)` van de regel op de originele afbeelding, waardoor je rechthoeken kunt tekenen, tekst kunt markeren of de coördinaten in een ander systeem kunt invoeren. 
+ +--- + +## Visualising the Result (Optional) + +Als je de bounding boxes over de afbeelding wilt zien, kun je Pillow (PIL) gebruiken om rechthoeken te tekenen. Hieronder staat een kort fragment; voel je vrij om het over te slaan als je alleen de ruwe gegevens nodig hebt. + +```python +from PIL import Image, ImageDraw + +# Open the original image +img = Image.open(input_image_path) +draw = ImageDraw.Draw(img) + +# Draw a rectangle around each line +for line in postprocessed_result.lines: + x, y, w, h = line.bounds + draw.rectangle([x, y, x + w, y + h], outline="red", width=2) + +# Save or show the annotated image +img.save("annotated_output.png") +img.show() +``` + +![run OCR on image example showing bounding boxes](/images/ocr-bounding-boxes.png "run OCR on image – bounding box overlay") + +De alt‑tekst hierboven bevat het **primary keyword**, wat voldoet aan de SEO‑vereiste voor alt‑attributen van afbeeldingen. + +--- + +## Why Structured OCR Recognition Beats Simple Text Extraction + +Je vraagt je misschien af: “Kan ik niet gewoon OCR uitvoeren en de tekst krijgen? Waarom de moeite doen met geometrie?” + +- **Spatial context:** Wanneer je velden op een formulier moet koppelen (bijv. “Date” naast een datumwaarde), geven coördinaten je *waar* de data zich bevindt. +- **Multi‑column layouts:** Eenvoudige lineaire tekst verliest de volgorde; gestructureerde data behoudt de kolomvolgorde. +- **Post‑processing accuracy:** Het kennen van de grootte van de box helpt je bepalen of een woord een koptekst, een voetnoot of een vreemd artefact is. + +Kortom, **structured OCR recognition** geeft je de flexibiliteit om slimmere pipelines te bouwen—of je nu data in een database stopt, doorzoekbare PDF's maakt, of een machine‑learning model traint dat de lay-out respecteert. 
+ +--- + +## Common Edge Cases and How to Handle Them + +| Situation | What to Watch For | Suggested Fix | +|-----------|-------------------|---------------| +| **Rotated or skewed images** | Bounding boxes may be off‑axis. | Pre‑process with deskewing (e.g., OpenCV’s `warpAffine`). | +| **Very small fonts** | Engine may miss characters, leading to empty lines. | Increase image resolution or use `ocr_engine.set_dpi(300)`. | +| **Mixed languages** | Wrong language model can cause garbled text. | Set `ocr_engine.language = ["en", "de"]` before recognition. | +| **Overlapping boxes** | Post‑processor might merge two lines unintentionally. | Verify `line.bounds` after processing; adjust thresholds in `ai.run_postprocessor`. | + +Deze scenario's vroeg aanpakken bespaart je later hoofdpijn, vooral wanneer je de oplossing opschaalt naar honderden documenten per dag. + +--- + +## Full End‑to‑End Script + +Hieronder staat het volledige, kant‑klaar script dat alle stappen samenvoegt. Kopieer‑plak, pas het pad van de afbeelding aan, en je bent klaar om te gaan. + +```python +# -*- coding: utf-8 -*- +""" +Run OCR on image – extract text with coordinates using structured OCR recognition. 
+Author: Your Name +Date: 2026-05-03 +""" + +import aocr +import ai +from PIL import Image, ImageDraw + +def run_structured_ocr(image_path: str, annotate: bool = False): + # 1️⃣ Initialise the OCR engine + ocr_engine = aocr.Engine() + ocr_engine.recognize_mode = aocr.RecognitionMode.Structured + + # 2️⃣ Recognise the image + raw_result = ocr_engine.recognize(image_path) + + # 3️⃣ Post‑process while keeping geometry + processed = ai.run_postprocessor(raw_result) + + # 4️⃣ Print each line with its bounding box + for line in processed.lines: + print(f"[{line.bounds}] {line.text}") + + # Optional visualisation + if annotate: + img = Image.open(image_path) + draw = ImageDraw.Draw(img) + for line in processed.lines: + x, y, w, h = line.bounds + draw.rectangle([x, y, x + w, y + h], outline="red", width=2) + annotated_path = "annotated_" + image_path + img.save(annotated_path) + print(f"Annotated image saved as {annotated_path}") + +if __name__ == "__main__": + INPUT_IMG = "input_image.png" + run_structured_ocr(INPUT_IMG, annotate=True) +``` + +Running this script will: + +1. **Run OCR on image** with structured mode. +2. **Extract text with coordinates** for every line. +3. Optioneel een geannoteerde PNG produceren die de kaders toont. + +--- + +## Conclusion + +Je hebt nu een solide, zelfstandige oplossing om **run OCR on image** en **extract text with coordinates** te gebruiken met **structured OCR recognition**. De code demonstreert elke stap—van engine‑initialisatie tot post‑processing en visuele verificatie—zodat je het kunt aanpassen aan bonnen, formulieren of elk visueel document dat precieze tekstoplokalisatie vereist. + +Wat is de volgende stap? Probeer de `aocr`‑engine te vervangen door een andere bibliotheek (Tesseract, EasyOCR) en kijk hoe hun gestructureerde output verschilt. Experimenteer met verschillende post‑processingstrategieën, zoals spell‑checking of aangepaste regex‑filters, om de nauwkeurigheid voor jouw domein te verhogen. 
En als je een grotere pipeline bouwt, overweeg dan om de `(text, bounds)`‑paren in een database op te slaan voor latere analyses. + +Veel plezier met coderen, en moge je OCR‑projecten altijd nauwkeurig zijn! + +{{< /blocks/products/pf/tutorial-page-section >}} +{{< /blocks/products/pf/main-container >}} +{{< /blocks/products/pf/main-wrap-class >}} +{{< blocks/products/products-backtop-button >}} \ No newline at end of file diff --git a/ocr/english/python/general/extract-text-from-image-ocr-with-aspose-ai-spell-check/_index.md b/ocr/english/python/general/extract-text-from-image-ocr-with-aspose-ai-spell-check/_index.md new file mode 100644 index 000000000..ccf708898 --- /dev/null +++ b/ocr/english/python/general/extract-text-from-image-ocr-with-aspose-ai-spell-check/_index.md @@ -0,0 +1,228 @@ +--- +category: general +date: 2026-05-03 +description: extract text from image using Aspose OCR and AI spell‑check. Learn how + to OCR image, load image for OCR, recognize text from invoice and release GPU resources. +draft: false +keywords: +- extract text from image +- how to ocr image +- load image for ocr +- release gpu resources +- recognize text from invoice +language: en +og_description: extract text from image with Aspose OCR and AI spell‑check. Step‑by‑step + guide covering how to OCR image, load image for OCR, and release GPU resources. +og_title: extract text from image – Complete OCR & Spell‑Check Guide +tags: +- OCR +- Aspose +- AI +- Python +title: extract text from image – OCR with Aspose AI Spell‑Check +url: /python/general/extract-text-from-image-ocr-with-aspose-ai-spell-check/ +--- + +{{< blocks/products/pf/main-wrap-class >}} +{{< blocks/products/pf/main-container >}} +{{< blocks/products/pf/tutorial-page-section >}} + +# extract text from image – Complete OCR & Spell‑Check Guide + +Ever needed to **extract text from image** but weren't sure which library would give you both speed and accuracy? You're not the only one. 
In many real‑world projects—think invoice processing, receipt digitisation, or scanning contracts—getting clean, searchable text from a picture is the first hurdle. + +The good news is that Aspose OCR paired with a lightweight Aspose AI model can handle that job in a few lines of Python. In this tutorial we’ll walk through **how to OCR image**, load the picture correctly, run a built‑in spell‑check post‑processor, and finally **release GPU resources** so your app stays memory‑friendly. + +By the end of this guide you’ll be able to **recognize text from invoice** images, correct common OCR mistakes automatically, and keep your GPU clean for the next batch. + +--- + +## What You’ll Need + +- Python 3.9 or newer (the code uses type hints but works on earlier 3.x versions) +- `aspose-ocr` and `aspose-ai` packages (install via `pip install aspose-ocr aspose-ai`) +- A CUDA‑enabled GPU is optional; the script will fall back to CPU if none is found. +- An example image, e.g., `sample_invoice.png`, placed in a folder you can reference. + +No heavy ML frameworks, no massive model downloads—just a small Q4‑K‑M quantised model that fits comfortably on most GPUs. + +--- + +## Step 1: Initialise the OCR Engine – extract text from image + +The first thing you do is create an `OcrEngine` instance and tell it which language you expect. Here we pick English and request plain‑text output, which is ideal for downstream processing. + +```python +import aocr # Aspose OCR package +import aspose.ai as ai # Aspose AI package + +# Initialise the OCR engine +ocr_engine = aocr.OcrEngine() +ocr_engine.language = aocr.Language.English # Choose any supported language +ocr_engine.recognize_mode = aocr.RecognitionMode.Plain # Plain text makes post‑processing easier +``` + +**Why this matters:** Setting the language narrows the character set, improving accuracy. The plain‑text mode strips layout information you typically don’t need when you just want to extract text from image. 
+ +--- + +## Step 2: Load image for OCR – how to OCR image + +Now we feed the engine an actual picture. The `Image.load` helper understands common formats (PNG, JPEG, TIFF) and abstracts away file‑IO quirks. + +```python +# Load the input image – this is the "load image for OCR" step +input_image = aocr.Image.load("YOUR_DIRECTORY/sample_invoice.png") +raw_text = ocr_engine.recognize(input_image) # Returns the recognised text as a string +``` + +**Tip:** If your source images are large, consider resizing them before sending them to the engine; smaller dimensions can cut GPU memory usage without hurting recognition quality. + +--- + +## Step 3: Configure the Aspose AI Model – recognize text from invoice + +Aspose AI ships with a tiny GGUF model that you can auto‑download. The example uses the `Qwen2.5‑3B‑Instruct‑GGUF` repository, quantised to `q4_k_m`. We also tell the runtime to allocate 20 layers on the GPU, which balances speed and VRAM usage. + +```python +# Model configuration – auto‑download a small Q4‑K‑M quantised model +model_config = ai.AsposeAIModelConfig() +model_config.allow_auto_download = "true" +model_config.hugging_face_repo_id = "bartowski/Qwen2.5-3B-Instruct-GGUF" +model_config.hugging_face_quantization = "q4_k_m" +model_config.gpu_layers = 20 # Use 20 GPU layers when a GPU is available +``` + +**Behind the scenes:** The quantised model is roughly 1.5 GB on disk, a fraction of a full‑precision model, yet it still captures enough linguistic nuance to flag typical OCR misspellings. + +--- + +## Step 4: Initialise AsposeAI and attach the spell‑check post‑processor + +Aspose AI includes a ready‑made spell‑check post‑processor. By attaching it, every OCR result will be cleaned up automatically. 
+ +```python +# Initialise AsposeAI and attach the built‑in spell‑check post‑processor +ocr_ai = ai.AsposeAI(model_config) # Pass the config we just built +ocr_ai.set_post_processor(ocr_ai.postprocessor_spell_check, {}) # Empty dict → default settings +``` + +**Why use the post‑processor?** OCR engines often misread “Invoice” as “Invo1ce” or “Total” as “T0tal”. The spell‑check runs a lightweight language model over the raw string and corrects those errors without you writing a custom dictionary. + +--- + +## Step 5: Run the spell‑check post‑processor on the OCR result + +With everything wired up, a single call yields the corrected text. We also print both the original and cleaned versions so you can see the improvement. + +```python +# Run the spell‑check post‑processor on the OCR result +corrected_text = ocr_ai.run_postprocessor(raw_text) + +print("Original :", raw_text) +print("Corrected:", corrected_text) +``` + +Typical output for an invoice might look like this: + +``` +Original : Invo1ce #12345 +Date: 2023/07/15 +Total: $1,250.00 +... +Corrected: Invoice #12345 +Date: 2023/07/15 +Total: $1,250.00 +... +``` + +Notice how “Invo1ce” turned into the proper word “Invoice”. That’s the power of the built‑in AI spell‑check. + +--- + +## Step 6: Release GPU resources – release gpu resources safely + +If you’re running this in a long‑lived service (e.g., a web API that processes dozens of invoices per minute), you must free the GPU context after each batch. Otherwise you’ll see memory leaks and eventually get “CUDA out of memory” errors. + +```python +# Release GPU resources – crucial to avoid memory leaks +ocr_ai.free_resources() +``` + +**Pro tip:** Call `free_resources()` inside a `finally` block or a context manager so it always executes, even if an exception occurs. + +--- + +## Full Working Example + +Putting all the pieces together gives you a self‑contained script you can drop into any project. 
+ +```python +# extract_text_from_image.py +import aocr +import aspose.ai as ai + +def main(): + # Step 1: Initialise OCR engine + ocr_engine = aocr.OcrEngine() + ocr_engine.language = aocr.Language.English + ocr_engine.recognize_mode = aocr.RecognitionMode.Plain + + # Step 2: Load image for OCR + input_image = aocr.Image.load("YOUR_DIRECTORY/sample_invoice.png") + raw_text = ocr_engine.recognize(input_image) + + # Step 3: Configure Aspose AI model + model_cfg = ai.AsposeAIModelConfig() + model_cfg.allow_auto_download = "true" + model_cfg.hugging_face_repo_id = "bartowski/Qwen2.5-3B-Instruct-GGUF" + model_cfg.hugging_face_quantization = "q4_k_m" + model_cfg.gpu_layers = 20 + + # Step 4: Initialise AI and attach spell‑check + ocr_ai = ai.AsposeAI(model_cfg) + ocr_ai.set_post_processor(ocr_ai.postprocessor_spell_check, {}) + + # Step 5: Run spell‑check + corrected_text = ocr_ai.run_postprocessor(raw_text) + + print("Original :", raw_text) + print("Corrected:", corrected_text) + + # Step 6: Release GPU resources + ocr_ai.free_resources() + +if __name__ == "__main__": + main() +``` + +Save the file, adjust the path to your image, and run `python extract_text_from_image.py`. You should see the cleaned invoice text printed to the console. + +--- + +## Frequently Asked Questions (FAQ) + +**Q: Does this work on CPU‑only machines?** +A: Absolutely. If no GPU is detected, Aspose AI falls back to CPU execution, though it will be slower. You can force CPU by setting `model_cfg.gpu_layers = 0`. + +**Q: What if my invoices are in a language other than English?** +A: Change `ocr_engine.language` to the appropriate enum value (e.g., `aocr.Language.Spanish`). The spell‑check model is multilingual, but you may get better results with a language‑specific model. + +**Q: Can I process multiple images in a loop?** +A: Yes. Just move the loading, recognition, and post‑processing steps inside a `for` loop. 
Remember to call `ocr_ai.free_resources()` after the loop or after each batch if you’re re‑using the same AI instance. + +**Q: How big is the model download?** +A: Roughly 1.5 GB for the quantised `q4_k_m` version. It’s cached after the first run, so subsequent executions are instant. + +--- + +## Conclusion + +In this tutorial we demonstrated how to **extract text from image** using Aspose OCR, configure a tiny AI model, apply a spell‑check post‑processor, and safely **release GPU resources**. The workflow covers everything from loading the picture to cleaning up after yourself, giving you a reliable pipeline for **recognize text from invoice** scenarios. + +Next steps? Try swapping the spell‑check for a custom entity‑extraction model + +{{< /blocks/products/pf/tutorial-page-section >}} +{{< /blocks/products/pf/main-container >}} +{{< /blocks/products/pf/main-wrap-class >}} +{{< blocks/products/products-backtop-button >}} \ No newline at end of file diff --git a/ocr/english/python/general/how-to-batch-ocr-with-aspose-ocr-full-python-guide/_index.md b/ocr/english/python/general/how-to-batch-ocr-with-aspose-ocr-full-python-guide/_index.md new file mode 100644 index 000000000..c43706774 --- /dev/null +++ b/ocr/english/python/general/how-to-batch-ocr-with-aspose-ocr-full-python-guide/_index.md @@ -0,0 +1,214 @@ +--- +category: general +date: 2026-05-03 +description: How to batch OCR images using Aspose OCR and AI spell‑check. Learn to + extract text from images, apply spell check, free AI resources and correct OCR errors. +draft: false +keywords: +- how to batch ocr +- extract text from images +- free ai resources +- apply spell check +- correct ocr errors +language: en +og_description: How to batch OCR images using Aspose OCR and AI spell‑check. Follow + a step‑by‑step guide to extract text from images, apply spell check, free AI resources + and correct OCR errors. 
+og_title: How to Batch OCR with Aspose OCR – Complete Python Tutorial +tags: +- OCR +- Python +- AI +- Aspose +title: How to Batch OCR with Aspose OCR – Full Python Guide +url: /python/general/how-to-batch-ocr-with-aspose-ocr-full-python-guide/ +--- + +{{< blocks/products/pf/main-wrap-class >}} +{{< blocks/products/pf/main-container >}} +{{< blocks/products/pf/tutorial-page-section >}} + +# How to Batch OCR with Aspose OCR – Full Python Guide + +Ever wondered **how to batch OCR** a whole folder of scanned PDFs or photos without writing a separate script for each file? You're not alone. In many real‑world pipelines you’ll need to **extract text from images**, clean up spelling mistakes, and finally free any AI resources you’ve allocated. This tutorial shows you exactly how to do that with Aspose OCR, a lightweight AI post‑processor, and a few lines of Python. + +We’ll walk through initializing the OCR engine, hooking up an AI spell‑checker, looping over a directory of pictures, and cleaning up the model afterwards. By the end you’ll have a ready‑to‑run script that **corrects OCR errors** automatically and releases **free AI resources** so your GPU stays happy. + +## What You’ll Need + +- Python 3.9+ (the code uses type‑hints but works on earlier 3.x versions) +- `asposeocr` package (`pip install asposeocr`) – this provides the OCR engine. +- Access to the Hugging Face model `bartowski/Qwen2.5-3B-Instruct-GGUF` (downloaded automatically). +- A GPU with at least a few GB of VRAM (the script sets `gpu_layers = 30`, you can lower it if needed). + +No external services, no paid APIs – everything runs locally. + +--- + +## Step 1: Set Up the OCR Engine – **How to Batch OCR** Efficiently + +Before we can process a thousand images we need a solid OCR engine. Aspose OCR lets us choose language and recognition mode in a single call. 
+ +```python +# Step 1: Initialize the OCR engine for English plain‑text output +def init_ocr() -> aocr.OcrEngine: + ocr_engine = aocr.OcrEngine() + ocr_engine.language = aocr.Language.English # English language pack + ocr_engine.recognize_mode = aocr.RecognitionMode.Plain # Returns raw string, no layout + return ocr_engine +``` + +**Why this matters:** Setting `recognize_mode` to `Plain` keeps the output lightweight, which is ideal when you plan to run a spell‑check later. If you needed layout information you’d switch to `Layout`, but that adds overhead you probably don’t want in a batch job. + +> **Pro tip:** If you’re dealing with multilingual scans, you can pass a list like `ocr_engine.language = [aocr.Language.English, aocr.Language.Spanish]`. + +--- + +## Step 2: Initialize the AI Post‑Processor – **Apply Spell Check** to OCR Output + +Aspose AI ships with a built‑in post‑processor that can run any model you like. Here we pull a quantized Qwen 2.5 model from Hugging Face and hook the spell‑check routine. + +```python +# Step 2: Configure and start the AI post‑processor +def init_ai() -> aocr.ai.AsposeAI: + model_cfg = AsposeAIModelConfig() + model_cfg.allow_auto_download = "true" + model_cfg.hugging_face_repo_id = "bartowski/Qwen2.5-3B-Instruct-GGUF" + model_cfg.hugging_face_quantization = "q4_k_m" + model_cfg.gpu_layers = 30 # Adjust based on your GPU memory + ai_processor = AsposeAI() + ai_processor.initialize(model_cfg) + + # Attach the built‑in spell‑check post‑processor + ai_processor.set_post_processor(ai_processor.postprocessor_spell_check, {}) + return ai_processor +``` + +**Why this matters:** The model is quantized (`q4_k_m`), which slashes memory usage while still delivering decent language understanding. By calling `set_post_processor` we tell Aspose AI to run the **apply spell check** step automatically on any string we feed it. 
+ +> **Watch out:** If your GPU cannot handle 30 layers, drop the number to 15 or even 5 – the script will still work, just a bit slower. + +--- + +## Step 3: Run OCR and **Correct OCR Errors** on a Single Image + +Now that both the OCR engine and AI spell‑checker are ready, we combine them. This function loads an image, extracts raw text, then runs the AI post‑processor to clean it up. + +```python +# Step 3: OCR an image and run the spell‑check post‑processor +def ocr_and_correct(image_path: str, + ocr_engine: aocr.OcrEngine, + ai_processor: aocr.ai.AsposeAI) -> str: + image = aocr.Image.load(image_path) # Load any supported format + raw_text = ocr_engine.recognize(image) # Plain string from OCR + corrected_text = ai_processor.run_postprocessor(raw_text) + return corrected_text +``` + +**Why this matters:** Directly feeding the raw OCR string into the AI model gives us a **correct OCR errors** pass without writing any regexes or custom dictionaries. The model knows context, so it can fix “recieve” → “receive” and even more subtle mistakes. + +--- + +## Step 4: **Extract Text from Images** in Bulk – The Real Batch Loop + +Here’s where the magic of **how to batch OCR** shines. We iterate over a directory, skip unsupported files, and write each corrected output to a `.txt` file. 
+ +```python +# Step 4: Process an entire folder of images +if __name__ == "__main__": + # Initialize once – reuse for every file + ocr_engine = init_ocr() + ai_processor = init_ai() + + input_dir = "YOUR_DIRECTORY/input_images" + output_dir = "YOUR_DIRECTORY/output_text" + os.makedirs(output_dir, exist_ok=True) + + for file_name in os.listdir(input_dir): + # Only handle common image extensions + if not file_name.lower().endswith(('.png', '.jpg', '.jpeg', '.tif', '.tiff')): + continue + + image_path = os.path.join(input_dir, file_name) + corrected = ocr_and_correct(image_path, ocr_engine, ai_processor) + + txt_path = os.path.join(output_dir, + os.path.splitext(file_name)[0] + ".txt") + with open(txt_path, "w", encoding="utf-8") as txt_file: + txt_file.write(corrected) + + print(f"Processed {file_name}") + + # Step 5: Release **free AI resources** after the batch finishes + ai_processor.free_resources() +``` + +### Expected output + +For an image containing the sentence *“The quick brown fox jumps over the lazzy dog.”* you’ll see a text file with: + +``` +The quick brown fox jumps over the lazy dog. +``` + +Notice the double “z” got corrected automatically – that’s the AI spell‑check in action. + +**Why this matters:** By creating the OCR and AI objects **once** and reusing them, we avoid the overhead of loading the model for every file. This is the most efficient way to **how to batch OCR** at scale. + +--- + +## Step 5: Clean Up – **Free AI Resources** Properly + +When you’re done, calling `free_resources()` releases GPU memory, CUDA contexts, and any temporary files the model created. + +```python +# Step 5: Explicitly free GPU and model memory +ai_processor.free_resources() +``` + +Skipping this step can leave dangling GPU allocations, which might crash subsequent Python processes or eat up VRAM. Think of it as the “turn off the lights” part of a batch job. 
+ +--- + +## Common Pitfalls & Extra Tips + +| Issue | What to Look For | Fix | +|-------|------------------|-----| +| **Out‑of‑memory errors** | GPU runs out after a few dozen images | Reduce `gpu_layers` or switch to CPU (`model_cfg.gpu_layers = 0`). | +| **Missing language pack** | OCR returns empty strings | Ensure `asposeocr` version includes English language data; reinstall if needed. | +| **Non‑image files** | Script crashes on a stray `.pdf` | The `if not file_name.lower().endswith(...)` guard already skips them. | +| **Spell‑check not applied** | Output looks identical to raw OCR | Verify `ai_processor.set_post_processor` was called before the loop. | +| **Slow batch speed** | Takes >5 seconds per image | Enable `model_cfg.allow_auto_download = "false"` after the first run, so the model isn’t re‑downloaded each time. | + +**Pro tip:** If you need to **extract text from images** in a language other than English, simply change `ocr_engine.language` to the appropriate enum (e.g., `aocr.Language.French`). The same AI post‑processor will still apply spell‑check, but you might want a language‑specific model for best results. + +--- + +## Recap & Next Steps + +We’ve covered the entire pipeline for **how to batch OCR**: + +1. **Initialize** a plain‑text OCR engine for English. +2. **Configure** an AI spell‑check model and bind it as a post‑processor. +3. **Run** OCR on each image and let the AI **correct OCR errors** automatically. +4. **Loop** over a directory to **extract text from images** in bulk. +5. **Free AI resources** once the job finishes. + +From here you could: + +- Pipe the corrected text into a downstream NLP pipeline (sentiment analysis, entity extraction, etc.). +- Swap the spell‑check post‑processor for a custom summarizer by calling `ai_processor.set_post_processor(your_custom_func, {})`. +- Parallelize the folder loop with `concurrent.futures.ThreadPoolExecutor` if your GPU can handle multiple streams. 
+ +--- + +## Final Thoughts + +Batching OCR doesn’t have to be a chore. By leveraging Aspose OCR together with a lightweight AI model, you get a **one‑stop solution** that **extracts text from images**, **applies spell check**, **corrects OCR errors**, and **frees AI resources** cleanly. Give the script a spin on a test folder, tweak the GPU layer count to match your hardware, and you’ll have a production‑ready pipeline in minutes. + +Got questions about tweaking the model, handling PDFs, or integrating this into a web service? Drop a comment below or ping me on GitHub. Happy coding, and may your OCR be ever accurate! + +{{< /blocks/products/pf/tutorial-page-section >}} +{{< /blocks/products/pf/main-container >}} +{{< /blocks/products/pf/main-wrap-class >}} +{{< blocks/products/products-backtop-button >}} \ No newline at end of file diff --git a/ocr/english/python/general/python-ocr-tutorial-batch-ocr-processing-made-easy/_index.md b/ocr/english/python/general/python-ocr-tutorial-batch-ocr-processing-made-easy/_index.md new file mode 100644 index 000000000..c584321bf --- /dev/null +++ b/ocr/english/python/general/python-ocr-tutorial-batch-ocr-processing-made-easy/_index.md @@ -0,0 +1,298 @@ +--- +category: general +date: 2026-05-03 +description: Python OCR tutorial that shows how to load PNG image files, recognize + text from image and free AI resources for batch OCR processing. +draft: false +keywords: +- python ocr tutorial +- batch ocr processing +- free ai resources +- load png image +- recognize text from image +language: en +og_description: Python OCR tutorial walks you through loading PNG images, recognizing + text from image and handling free AI resources for batch OCR processing. 
+og_title: Python OCR Tutorial – Quick Batch OCR with Free AI Resources +tags: +- OCR +- Python +- AI +title: Python OCR Tutorial – Batch OCR Processing Made Easy +url: /python/general/python-ocr-tutorial-batch-ocr-processing-made-easy/ +--- + +{{< blocks/products/pf/main-wrap-class >}} +{{< blocks/products/pf/main-container >}} +{{< blocks/products/pf/tutorial-page-section >}} + +# Python OCR Tutorial – Batch OCR Processing Made Easy + +Ever needed a **python ocr tutorial** that actually lets you run OCR on dozens of PNG files without pulling your hair out? You're not alone. In many real‑world projects you have to **load png image** files, feed them to an engine, and then clean up the AI resources when you're done. + +In this guide we’ll walk through a complete, ready‑to‑run example that shows exactly how to **recognize text from image** files, process them in batch, and free up the underlying AI memory. By the end you’ll have a self‑contained script you can drop into any project—no extra fluff, just the essentials. + +## What You’ll Need + +- Python 3.10 or newer (the syntax used here relies on f‑strings and type hints) +- An OCR library that exposes an `engine.recognize` method – for demo purposes we’ll assume a fictional `aocr` package, but you can swap in Tesseract, EasyOCR, etc. +- The `ai` helper module shown in the code snippet (it handles model initialization and resource cleanup) +- A folder full of PNG files you want to process + +If you don’t have `aocr` or `ai` installed, you can mimic them with stubs – see the “Optional Stubs” section near the end. + +## Step 1: Initialize the AI Engine (Free AI Resources) + +Before you feed any image into the OCR pipeline, the underlying model must be ready. Initializing only once saves memory and speeds up batch jobs. 
+ +```python +# step_1_initialize.py +import ai # hypothetical helper that wraps the AI model +import aocr # OCR library + +def init_engine(config_path: str = "config.yaml"): + """ + Initialize the AI engine if it hasn't been set up yet. + This uses free AI resources – the engine will be released later. + """ + if not ai.is_initialized(): + ai.initialize(config_path) # auto‑initialize with the provided configuration + else: + print("Engine already initialized.") +``` + +**Why this matters:** +Calling `ai.initialize` repeatedly for each image would allocate GPU memory over and over, eventually crashing the script. By checking `ai.is_initialized()` we guarantee a single allocation – that’s the “free AI resources” principle. + +## Step 2: Load PNG Image Files for Batch OCR Processing + +Now we gather all the PNG files we want to run through OCR. Using `pathlib` keeps the code OS‑agnostic. + +```python +# step_2_load_images.py +from pathlib import Path +from typing import List + +def collect_png_paths(directory: str) -> List[Path]: + """ + Scan `directory` and return a list of Path objects pointing to PNG files. + """ + base_path = Path(directory) + if not base_path.is_dir(): + raise NotADirectoryError(f"'{directory}' is not a valid folder.") + + png_files = sorted(base_path.glob("*.png")) + if not png_files: + raise FileNotFoundError("No PNG images found in the specified directory.") + + print(f"Found {len(png_files)} PNG image(s) to process.") + return png_files +``` + +**Edge case:** +If the folder contains non‑PNG files (e.g., JPEGs) they’ll be ignored, preventing `engine.recognize` from choking on an unsupported format. + +## Step 3: Run OCR on Each Image and Apply Post‑Processing + +With the engine ready and the file list prepared, we can loop over the images, extract raw text, and hand it to a post‑processor that cleans up common OCR artefacts (like stray line breaks). 
# step_3_ocr_batch.py
import aocr
import ai
from pathlib import Path
from typing import List

def ocr_batch(image_paths: List[Path]) -> List[str]:
    """
    Perform OCR on each PNG image and return a list of cleaned strings.

    Args:
        image_paths: Paths to the PNG files to recognise.

    Returns:
        One post-processed text string per input image, in input order.
    """
    results = []
    for image_path in image_paths:
        # Load the image – aocr.Image.load abstracts away Pillow/OpenCV details
        img = aocr.Image.load(str(image_path))

        # Recognize raw text.
        # FIX: the recogniser is exposed as an attribute of the aocr package
        # (see the "Optional Stubs" section); the bare name `engine` used in
        # the original snippet was never defined and raised NameError.
        raw_text = aocr.engine.recognize(img)

        # Refine the raw OCR output using the AI post-processor
        cleaned_text = ai.run_postprocessor(raw_text)
        results.append(cleaned_text)

        print(f"Processed {image_path.name}: {len(cleaned_text)} characters extracted.")

    return results
+""" + +import sys +from pathlib import Path +import ai +import aocr + +# ---------------------------------------------------------------------- +# Helper functions (copied from the steps above) +# ---------------------------------------------------------------------- +def init_engine(cfg: str = "config.yaml"): + if not ai.is_initialized(): + ai.initialize(cfg) + else: + print("Engine already initialized.") + +def collect_png_paths(directory: str): + base_path = Path(directory) + if not base_path.is_dir(): + raise NotADirectoryError(f"'{directory}' is not a valid folder.") + png_files = sorted(base_path.glob("*.png")) + if not png_files: + raise FileNotFoundError("No PNG images found in the specified directory.") + print(f"Found {len(png_files)} PNG image(s) to process.") + return png_files + +def ocr_batch(image_paths): + results = [] + for image_path in image_paths: + img = aocr.Image.load(str(image_path)) + raw_text = engine.recognize(img) + cleaned_text = ai.run_postprocessor(raw_text) + results.append(cleaned_text) + print(f"Processed {image_path.name}: {len(cleaned_text)} characters.") + return results + +def release_resources(): + if ai.is_initialized(): + ai.free_resources() + print("AI resources released.") + else: + print("No resources to release.") + +# ---------------------------------------------------------------------- +# Main execution block +# ---------------------------------------------------------------------- +if __name__ == "__main__": + if len(sys.argv) != 2: + print("Usage: python batch_ocr.py ") + sys.exit(1) + + image_dir = sys.argv[1] + + try: + init_engine() + png_paths = collect_png_paths(image_dir) + texts = ocr_batch(png_paths) + + # Optional: write results to a single text file + output_file = Path("ocr_results.txt") + with output_file.open("w", encoding="utf-8") as f: + for path, txt in zip(png_paths, texts): + f.write(f"--- {path.name} ---\n") + f.write(txt + "\n\n") + print(f"All results saved to {output_file.resolve()}") + 
# aocr/__init__.py
class Image:
    """Minimal stand-in for the real aocr Image class."""

    @staticmethod
    def load(path):
        # Return a cheap placeholder instead of decoding a real file.
        return f"ImageObject({path})"

def dummy_recognize(image):
    """Pretend to OCR *image* and return a canned string."""
    return "Raw OCR output for " + str(image)

# FIX: wrap the function in staticmethod(). As a plain class-dict entry it
# becomes a bound method, so engine.recognize(img) would pass the instance
# as `image` and raise TypeError ("takes 1 positional argument but 2 given").
engine = type("Engine", (), {"recognize": staticmethod(dummy_recognize)})()


# ai/__init__.py
# Module-level flag tracking whether the fake engine has been "initialised".
_state = {"initialized": False}

def is_initialized():
    """Return True once initialize() has been called (and not yet freed)."""
    return _state["initialized"]

def initialize(cfg):
    """Mark the fake AI engine as initialised with *cfg*."""
    print(f"Initializing AI engine with {cfg}")
    _state["initialized"] = True

def run_postprocessor(text):
    # Very naive cleanup: collapse runs of whitespace into single spaces.
    return " ".join(text.split())

def free_resources():
    """Reset the initialised flag, mimicking GPU/model teardown."""
    print("Freeing AI resources")
    _state["initialized"] = False
Just remember to keep a single `ai` instance – spawning many threads that each call `ai.initialize` defeats the “free AI resources” goal. +- **Error resilience:** Wrap the per‑image loop in a `try/except` block so a single corrupted PNG won’t abort the whole batch. + +## Conclusion + +You now have a **python ocr tutorial** that demonstrates how to **load png image** files, perform **batch OCR processing**, and responsibly manage **free AI resources**. The complete, runnable example shows exactly how to **recognize text from image** objects and clean up afterward, so you can copy‑paste it into your own projects without hunting for missing pieces. + +Ready for the next step? Try swapping the stubbed `aocr` and `ai` modules with real libraries like `pytesseract` and `torchvision`. You can also extend the script to output JSON, push results to a database, or integrate with a cloud storage bucket. The sky’s the limit—happy coding! + +{{< /blocks/products/pf/tutorial-page-section >}} +{{< /blocks/products/pf/main-container >}} +{{< /blocks/products/pf/main-wrap-class >}} +{{< blocks/products/products-backtop-button >}} \ No newline at end of file diff --git a/ocr/english/python/general/run-ocr-on-image-complete-guide-to-structured-text-extractio/_index.md b/ocr/english/python/general/run-ocr-on-image-complete-guide-to-structured-text-extractio/_index.md new file mode 100644 index 000000000..77d9c76bc --- /dev/null +++ b/ocr/english/python/general/run-ocr-on-image-complete-guide-to-structured-text-extractio/_index.md @@ -0,0 +1,254 @@ +--- +category: general +date: 2026-05-03 +description: Learn how to run OCR on image and extract text with coordinates using + structured OCR recognition. Step‑by‑step Python code included. 
+draft: false +keywords: +- run OCR on image +- extract text with coordinates +- structured OCR recognition +- OCR post‑processing +- bounding box extraction +- image text detection +language: en +og_description: Run OCR on image and get text with coordinates using structured OCR + recognition. Full Python example with explanations. +og_title: Run OCR on image – Structured Text Extraction Tutorial +tags: +- OCR +- Python +- Computer Vision +title: Run OCR on image – Complete Guide to Structured Text Extraction +url: /python/general/run-ocr-on-image-complete-guide-to-structured-text-extractio/ +--- + +{{< blocks/products/pf/main-wrap-class >}} +{{< blocks/products/pf/main-container >}} +{{< blocks/products/pf/tutorial-page-section >}} + +# Run OCR on image – Complete Guide to Structured Text Extraction + +Ever needed to **run OCR on image** files but weren’t sure how to keep the exact positions of each word? You’re not alone. In many projects—receipt scanning, form digitisation, or UI testing—you need not only the raw text but also the bounding boxes that tell you where each line lives on the picture. + +This tutorial shows you a practical way to *run OCR on image* using the **aocr** engine, request **structured OCR recognition**, and then post‑process the result while preserving the geometry. By the end you’ll be able to **extract text with coordinates** in just a few lines of Python, and you’ll understand why the structured mode matters for downstream tasks. + +## What You’ll Learn + +- How to initialise the OCR engine for **structured OCR recognition**. +- How to feed an image and receive raw results that include line bounds. +- How to run a post‑processor that cleans up the text without losing geometry. +- How to iterate over the final lines and print each piece of text together with its bounding box. + +No magic, no hidden steps—just a complete, runnable example you can drop into your own project. 
+ +--- + +## Prerequisites + +Before we dive in, make sure you have the following installed: + +```bash +pip install aocr ai # hypothetical packages; replace with real ones if needed +``` + +You’ll also need an image file (`input_image.png` or `.jpg`) that contains clear, readable text. Anything from a scanned invoice to a screenshot works, as long as the OCR engine can see the characters. + +--- + +## Step 1: Initialise the OCR engine for structured recognition + +The first thing we do is create an instance of `aocr.Engine()` and tell it we want **structured OCR recognition**. Structured mode returns not only the plain text but also geometric data (bounding rectangles) for each line, which is essential when you need to map text back onto the image. + +```python +import aocr +import ai # hypothetical post‑processing module + +# Initialise the OCR engine +ocr_engine = aocr.Engine() + +# Request structured recognition (text + geometry) +ocr_engine.recognize_mode = aocr.RecognitionMode.Structured +``` + +> **Why this matters:** +> In the default mode the engine might only give you a string of concatenated words. Structured mode gives you a hierarchy of pages → lines → words, each with coordinates, making it far easier to overlay results on the original image or feed them into a layout‑aware model. + +--- + +## Step 2: Run OCR on the image and obtain raw results + +Now we feed the image to the engine. The `recognize` call returns an `OcrResult` object that contains a collection of lines, each with its own bounding rectangle. + +```python +# Load your image (any format supported by aocr) +input_image_path = "input_image.png" + +# Run OCR – this returns an OcrResult with lines and bounds +raw_result = ocr_engine.recognize(input_image_path) +``` + +At this point `raw_result.lines` holds objects with two important attributes: + +- `text` – the recognised string for that line. +- `bounds` – a tuple like `(x, y, width, height)` describing the line’s position. 
+ +--- + +## Step 3: Post‑process while preserving geometry + +Raw OCR output is often noisy: stray characters, misplaced spaces, or line‑break issues. The `ai.run_postprocessor` function cleans the text but **keeps the original geometry** intact, so you still have accurate coordinates. + +```python +# Apply a post‑processing step that corrects common OCR errors +postprocessed_result = ai.run_postprocessor(raw_result) + +# The structure (lines + bounds) stays the same, only `line.text` changes +``` + +> **Pro tip:** If you have domain‑specific vocabularies (e.g., product codes), feed a custom dictionary to the post‑processor to improve accuracy. + +--- + +## Step 4: Extract text with coordinates – iterate and display + +Finally, we loop over the cleaned lines, printing each line’s bounding box alongside its text. This is the core of **extract text with coordinates**. + +```python +# Print each recognised line together with its bounding box +for line in postprocessed_result.lines: + print(f"[{line.bounds}] {line.text}") +``` + +### Expected Output + +Assuming the input image contains two lines: “Invoice #12345” and “Total: $89.99”, you’ll see something like: + +``` +[(15, 30, 210, 25)] Invoice #12345 +[(15, 70, 190, 25)] Total: $89.99 +``` + +The first tuple is the `(x, y, width, height)` of the line on the original image, allowing you to draw rectangles, highlight text, or feed the coordinates into another system. + +--- + +## Visualising the Result (Optional) + +If you want to see the bounding boxes overlaid on the image, you can use Pillow (PIL) to draw rectangles. Below is a quick snippet; feel free to skip if you only need the raw data. 
+ +```python +from PIL import Image, ImageDraw + +# Open the original image +img = Image.open(input_image_path) +draw = ImageDraw.Draw(img) + +# Draw a rectangle around each line +for line in postprocessed_result.lines: + x, y, w, h = line.bounds + draw.rectangle([x, y, x + w, y + h], outline="red", width=2) + +# Save or show the annotated image +img.save("annotated_output.png") +img.show() +``` + +![run OCR on image example showing bounding boxes](/images/ocr-bounding-boxes.png "run OCR on image – bounding box overlay") + +The alt text above contains the **primary keyword**, satisfying the SEO requirement for image alt attributes. + +--- + +## Why Structured OCR Recognition Beats Simple Text Extraction + +You might wonder, “Can’t I just run OCR and get the text? Why bother with geometry?” + +- **Spatial context:** When you need to map fields on a form (e.g., “Date” next to a date value), coordinates tell you *where* the data lives. +- **Multi‑column layouts:** Simple linear text loses ordering; structured data preserves column order. +- **Post‑processing accuracy:** Knowing the box size helps you decide whether a word is a header, a footnote, or a stray artifact. + +In short, **structured OCR recognition** gives you the flexibility to build smarter pipelines—whether you’re feeding data into a database, creating searchable PDFs, or training a machine‑learning model that respects layout. + +--- + +## Common Edge Cases and How to Handle Them + +| Situation | What to Watch For | Suggested Fix | +|-----------|-------------------|---------------| +| **Rotated or skewed images** | Bounding boxes may be off‑axis. | Pre‑process with deskewing (e.g., OpenCV’s `warpAffine`). | +| **Very small fonts** | Engine may miss characters, leading to empty lines. | Increase image resolution or use `ocr_engine.set_dpi(300)`. | +| **Mixed languages** | Wrong language model can cause garbled text. | Set `ocr_engine.language = ["en", "de"]` before recognition. 
| +| **Overlapping boxes** | Post‑processor might merge two lines unintentionally. | Verify `line.bounds` after processing; adjust thresholds in `ai.run_postprocessor`. | + +Addressing these scenarios early saves you headaches later, especially when you scale the solution to hundreds of documents a day. + +--- + +## Full End‑to‑End Script + +Below is the complete, ready‑to‑run program that ties all the steps together. Copy‑paste, adjust the image path, and you’re good to go. + +```python +# -*- coding: utf-8 -*- +""" +Run OCR on image – extract text with coordinates using structured OCR recognition. +Author: Your Name +Date: 2026-05-03 +""" + +import aocr +import ai +from PIL import Image, ImageDraw + +def run_structured_ocr(image_path: str, annotate: bool = False): + # 1️⃣ Initialise the OCR engine + ocr_engine = aocr.Engine() + ocr_engine.recognize_mode = aocr.RecognitionMode.Structured + + # 2️⃣ Recognise the image + raw_result = ocr_engine.recognize(image_path) + + # 3️⃣ Post‑process while keeping geometry + processed = ai.run_postprocessor(raw_result) + + # 4️⃣ Print each line with its bounding box + for line in processed.lines: + print(f"[{line.bounds}] {line.text}") + + # Optional visualisation + if annotate: + img = Image.open(image_path) + draw = ImageDraw.Draw(img) + for line in processed.lines: + x, y, w, h = line.bounds + draw.rectangle([x, y, x + w, y + h], outline="red", width=2) + annotated_path = "annotated_" + image_path + img.save(annotated_path) + print(f"Annotated image saved as {annotated_path}") + +if __name__ == "__main__": + INPUT_IMG = "input_image.png" + run_structured_ocr(INPUT_IMG, annotate=True) +``` + +Running this script will: + +1. **Run OCR on image** with structured mode. +2. **Extract text with coordinates** for every line. +3. Optionally produce an annotated PNG showing the boxes. 
+ +--- + +## Conclusion + +You now have a solid, self‑contained solution to **run OCR on image** and **extract text with coordinates** using **structured OCR recognition**. The code demonstrates every step—from engine initialisation to post‑processing and visual verification—so you can adapt it to receipts, forms, or any visual document that needs precise text localisation. + +What’s next? Try swapping the `aocr` engine for another library (Tesseract, EasyOCR) and see how their structured outputs differ. Experiment with different post‑processing strategies, such as spell‑checking or custom regex filters, to boost accuracy for your domain. And if you’re building a larger pipeline, consider storing the `(text, bounds)` pairs in a database for later analytics. + +Happy coding, and may your OCR projects be ever accurate! + +{{< /blocks/products/pf/tutorial-page-section >}} +{{< /blocks/products/pf/main-container >}} +{{< /blocks/products/pf/main-wrap-class >}} +{{< blocks/products/products-backtop-button >}} \ No newline at end of file diff --git a/ocr/french/python/general/extract-text-from-image-ocr-with-aspose-ai-spell-check/_index.md b/ocr/french/python/general/extract-text-from-image-ocr-with-aspose-ai-spell-check/_index.md new file mode 100644 index 000000000..8f0b72481 --- /dev/null +++ b/ocr/french/python/general/extract-text-from-image-ocr-with-aspose-ai-spell-check/_index.md @@ -0,0 +1,231 @@ +--- +category: general +date: 2026-05-03 +description: extraire du texte d’une image à l’aide d’Aspose OCR et de la correction + orthographique IA. Apprenez comment OCRiser une image, charger une image pour l’OCR, + reconnaître le texte d’une facture et libérer les ressources GPU. +draft: false +keywords: +- extract text from image +- how to ocr image +- load image for ocr +- release gpu resources +- recognize text from invoice +language: fr +og_description: extraire du texte d’une image avec Aspose OCR et la correction orthographique + IA. 
Guide étape par étape couvrant comment OCRiser une image, charger l’image pour + l’OCR et libérer les ressources GPU. +og_title: extraire du texte d’une image – Guide complet d’OCR et de vérification orthographique +tags: +- OCR +- Aspose +- AI +- Python +title: extraire du texte d’une image – OCR avec le correcteur orthographique Aspose + AI +url: /fr/python/general/extract-text-from-image-ocr-with-aspose-ai-spell-check/ +--- + +{{< blocks/products/pf/main-wrap-class >}} +{{< blocks/products/pf/main-container >}} +{{< blocks/products/pf/tutorial-page-section >}} + +# extraire du texte d'image – Guide complet OCR & Spell‑Check + +Vous avez déjà eu besoin d'**extraire du texte d'image** sans savoir quelle bibliothèque offrirait à la fois rapidité et précision ? Vous n'êtes pas seul. Dans de nombreux projets réels—pensons au traitement de factures, à la numérisation de reçus ou à la lecture de contrats—obtenir du texte propre et interrogeable à partir d’une photo est le premier obstacle. + +Bonne nouvelle : Aspose OCR associé à un modèle léger Aspose AI peut accomplir cette tâche en quelques lignes de Python. Dans ce tutoriel, nous verrons **comment OCR une image**, charger correctement l’image, exécuter un post‑processeur de vérification orthographique intégré, puis **libérer les ressources GPU** afin que votre application reste économe en mémoire. + +À la fin de ce guide, vous serez capable de **reconnaître le texte d'images de factures**, corriger automatiquement les erreurs courantes d’OCR et garder votre GPU propre pour le lot suivant. + +--- + +## Ce dont vous avez besoin + +- Python 3.9 ou plus récent (le code utilise des annotations de type mais fonctionne sur les versions 3.x antérieures) +- Packages `aspose-ocr` et `aspose-ai` (installez avec `pip install aspose-ocr aspose-ai`) +- Un GPU compatible CUDA est optionnel ; le script reviendra au CPU si aucun n’est détecté. +- Une image d’exemple, par ex. `sample_invoice.png`, placée dans un dossier accessible. 
+ +Pas de frameworks ML lourds, pas de téléchargements de modèles massifs—juste un petit modèle quantisé Q4‑K‑M qui tient confortablement sur la plupart des GPU. + +--- + +## Étape 1 : Initialiser le moteur OCR – extraire du texte d'image + +La première chose à faire est de créer une instance `OcrEngine` et d’indiquer la langue attendue. Ici nous choisissons l’anglais et demandons une sortie en texte brut, idéale pour le traitement en aval. + +```python +import aocr # Aspose OCR package +import aspose.ai as ai # Aspose AI package + +# Initialise the OCR engine +ocr_engine = aocr.OcrEngine() +ocr_engine.language = aocr.Language.English # Choose any supported language +ocr_engine.recognize_mode = aocr.RecognitionMode.Plain # Plain text makes post‑processing easier +``` + +**Pourquoi c'est important :** définir la langue restreint l’ensemble de caractères, améliorant ainsi la précision. Le mode texte brut supprime les informations de mise en page dont vous n’avez généralement pas besoin lorsque vous ne faites qu’extraire du texte d'image. + +--- + +## Étape 2 : Charger l'image pour l'OCR – comment OCR une image + +Nous transmettons maintenant une vraie image au moteur. L’assistant `Image.load` comprend les formats courants (PNG, JPEG, TIFF) et masque les particularités d’E/S de fichiers. + +```python +# Load the input image – this is the "load image for OCR" step +input_image = aocr.Image.load("YOUR_DIRECTORY/sample_invoice.png") +raw_text = ocr_engine.recognize(input_image) # Returns the recognised text as a string +``` + +**Astuce :** si vos images sources sont volumineuses, pensez à les redimensionner avant de les envoyer au moteur ; des dimensions plus petites peuvent réduire l’utilisation de la mémoire GPU sans nuire à la qualité de reconnaissance. + +--- + +## Étape 3 : Configurer le modèle Aspose AI – reconnaître le texte d'une facture + +Aspose AI fournit un petit modèle GGUF que vous pouvez télécharger automatiquement. 
L’exemple utilise le dépôt `Qwen2.5‑3B‑Instruct‑GGUF`, quantisé en `q4_k_m`. Nous indiquons également au runtime d’allouer 20 couches sur le GPU, ce qui équilibre vitesse et consommation de VRAM. + +```python +# Model configuration – auto‑download a small Q4‑K‑M quantised model +model_config = ai.AsposeAIModelConfig() +model_config.allow_auto_download = "true" +model_config.hugging_face_repo_id = "bartowski/Qwen2.5-3B-Instruct-GGUF" +model_config.hugging_face_quantization = "q4_k_m" +model_config.gpu_layers = 20 # Use 20 GPU layers when a GPU is available +``` + +**Dans les coulisses :** le modèle quantisé occupe environ 1,5 Go sur le disque, une fraction d’un modèle en pleine précision, tout en conservant suffisamment de nuances linguistiques pour repérer les fautes d’OCR typiques. + +--- + +## Étape 4 : Initialiser AsposeAI et attacher le post‑processeur de vérification orthographique + +Aspose AI inclut un post‑processeur de vérification orthographique prêt à l’emploi. En l’attachez, chaque résultat d’OCR sera automatiquement nettoyé. + +```python +# Initialise AsposeAI and attach the built‑in spell‑check post‑processor +ocr_ai = ai.AsposeAI(model_config) # Pass the config we just built +ocr_ai.set_post_processor(ocr_ai.postprocessor_spell_check, {}) # Empty dict → default settings +``` + +**Pourquoi utiliser le post‑processeur ?** Les moteurs OCR confondent souvent « Invoice » avec « Invo1ce » ou « Total » avec « T0tal ». La vérification orthographique exécute un modèle linguistique léger sur la chaîne brute et corrige ces erreurs sans que vous ayez à créer un dictionnaire personnalisé. + +--- + +## Étape 5 : Exécuter le post‑processeur de vérification orthographique sur le résultat OCR + +Une fois tout branché, un seul appel renvoie le texte corrigé. Nous affichons également les versions originale et nettoyée afin que vous puissiez constater l’amélioration. 
+ +```python +# Run the spell‑check post‑processor on the OCR result +corrected_text = ocr_ai.run_postprocessor(raw_text) + +print("Original :", raw_text) +print("Corrected:", corrected_text) +``` + +Un résultat typique pour une facture pourrait ressembler à ceci : + +``` +Original : Invo1ce #12345 +Date: 2023/07/15 +Total: $1,250.00 +... +Corrected: Invoice #12345 +Date: 2023/07/15 +Total: $1,250.00 +... +``` + +Remarquez comment « Invo1ce » est devenu le mot correct « Invoice ». C’est la puissance du correcteur orthographique intégré basé sur l’IA. + +--- + +## Étape 6 : Libérer les ressources GPU – libérer les ressources GPU en toute sécurité + +Si vous exécutez cela dans un service de longue durée (par ex. une API web qui traite des dizaines de factures par minute), vous devez libérer le contexte GPU après chaque lot. Sinon vous verrez des fuites de mémoire et, finalement, des erreurs « CUDA out of memory ». + +```python +# Release GPU resources – crucial to avoid memory leaks +ocr_ai.free_resources() +``` + +**Astuce pro :** appelez `free_resources()` dans un bloc `finally` ou un gestionnaire de contexte afin qu’il s’exécute toujours, même en cas d’exception. + +--- + +## Exemple complet fonctionnel + +Assembler toutes les pièces donne un script autonome que vous pouvez intégrer à n’importe quel projet. 
+ +```python +# extract_text_from_image.py +import aocr +import aspose.ai as ai + +def main(): + # Step 1: Initialise OCR engine + ocr_engine = aocr.OcrEngine() + ocr_engine.language = aocr.Language.English + ocr_engine.recognize_mode = aocr.RecognitionMode.Plain + + # Step 2: Load image for OCR + input_image = aocr.Image.load("YOUR_DIRECTORY/sample_invoice.png") + raw_text = ocr_engine.recognize(input_image) + + # Step 3: Configure Aspose AI model + model_cfg = ai.AsposeAIModelConfig() + model_cfg.allow_auto_download = "true" + model_cfg.hugging_face_repo_id = "bartowski/Qwen2.5-3B-Instruct-GGUF" + model_cfg.hugging_face_quantization = "q4_k_m" + model_cfg.gpu_layers = 20 + + # Step 4: Initialise AI and attach spell‑check + ocr_ai = ai.AsposeAI(model_cfg) + ocr_ai.set_post_processor(ocr_ai.postprocessor_spell_check, {}) + + # Step 5: Run spell‑check + corrected_text = ocr_ai.run_postprocessor(raw_text) + + print("Original :", raw_text) + print("Corrected:", corrected_text) + + # Step 6: Release GPU resources + ocr_ai.free_resources() + +if __name__ == "__main__": + main() +``` + +Enregistrez le fichier, ajustez le chemin vers votre image, puis lancez `python extract_text_from_image.py`. Vous devriez voir le texte de la facture nettoyé s’afficher dans la console. + +--- + +## Questions fréquentes (FAQ) + +**Q : Cela fonctionne-t-il sur des machines uniquement CPU ?** +R : Absolument. Si aucun GPU n’est détecté, Aspose AI revient à l’exécution CPU, bien que plus lente. Vous pouvez forcer le CPU en réglant `model_cfg.gpu_layers = 0`. + +**Q : Et si mes factures sont dans une langue autre que l’anglais ?** +R : Modifiez `ocr_engine.language` avec la valeur d’énumération appropriée (par ex. `aocr.Language.Spanish`). Le modèle de vérification orthographique est multilingue, mais vous obtiendrez de meilleurs résultats avec un modèle spécifique à la langue. + +**Q : Puis‑je traiter plusieurs images dans une boucle ?** +R : Oui. 
Placez simplement les étapes de chargement, de reconnaissance et de post‑traitement à l’intérieur d’une boucle `for`. N’oubliez pas d’appeler `ocr_ai.free_resources()` après la boucle ou après chaque lot si vous réutilisez la même instance AI. + +**Q : Quelle est la taille du téléchargement du modèle ?** +R : Environ 1,5 Go pour la version quantisée `q4_k_m`. Il est mis en cache après la première exécution, de sorte que les exécutions suivantes sont instantanées. + +--- + +## Conclusion + +Dans ce tutoriel, nous avons montré comment **extraire du texte d'image** avec Aspose OCR, configurer un petit modèle IA, appliquer un post‑processeur de vérification orthographique et libérer en toute sécurité les **ressources GPU**. Le flux couvre tout, du chargement de l’image au nettoyage final, vous offrant une chaîne fiable pour les scénarios **reconnaître le texte d’une facture**. + +Prochaines étapes ? Essayez de remplacer le spell‑check par un modèle d’**entity‑extraction** personnalisé + +{{< /blocks/products/pf/tutorial-page-section >}} +{{< /blocks/products/pf/main-container >}} +{{< /blocks/products/pf/main-wrap-class >}} +{{< blocks/products/products-backtop-button >}} \ No newline at end of file diff --git a/ocr/french/python/general/how-to-batch-ocr-with-aspose-ocr-full-python-guide/_index.md b/ocr/french/python/general/how-to-batch-ocr-with-aspose-ocr-full-python-guide/_index.md new file mode 100644 index 000000000..34c2ca6fb --- /dev/null +++ b/ocr/french/python/general/how-to-batch-ocr-with-aspose-ocr-full-python-guide/_index.md @@ -0,0 +1,217 @@ +--- +category: general +date: 2026-05-03 +description: Comment effectuer une reconnaissance OCR par lots d’images avec Aspose + OCR et la vérification orthographique IA. Apprenez à extraire le texte des images, + appliquer la vérification orthographique, libérer les ressources IA et corriger + les erreurs d’OCR. 
+draft: false +keywords: +- how to batch ocr +- extract text from images +- free ai resources +- apply spell check +- correct ocr errors +language: fr +og_description: Comment traiter en lot des images avec OCR en utilisant Aspose OCR + et la correction orthographique IA. Suivez un guide étape par étape pour extraire + le texte des images, appliquer la vérification orthographique, exploiter des ressources + IA gratuites et corriger les erreurs d’OCR. +og_title: Comment réaliser une OCR par lots avec Aspose OCR – Tutoriel complet Python +tags: +- OCR +- Python +- AI +- Aspose +title: Comment faire de l'OCR par lots avec Aspose OCR – Guide complet Python +url: /fr/python/general/how-to-batch-ocr-with-aspose-ocr-full-python-guide/ +--- + +{{< blocks/products/pf/main-wrap-class >}} +{{< blocks/products/pf/main-container >}} +{{< blocks/products/pf/tutorial-page-section >}} + +# Comment faire du OCR par lots avec Aspose OCR – Guide complet Python + +Vous vous êtes déjà demandé **comment faire du OCR par lots** sur un dossier entier de PDF numérisés ou de photos sans écrire un script séparé pour chaque fichier ? Vous n'êtes pas seul. Dans de nombreuses pipelines réelles, vous devez **extraire du texte d'images**, corriger les fautes d'orthographe, et enfin libérer les ressources IA que vous avez allouées. Ce tutoriel vous montre exactement comment faire cela avec Aspose OCR, un post‑processeur IA léger, et quelques lignes de Python. + +Nous allons parcourir l'initialisation du moteur OCR, l'intégration d'un correcteur orthographique IA, la boucle sur un répertoire d'images, et le nettoyage du modèle à la fin. À la fin, vous disposerez d'un script prêt à l'emploi qui **corrige automatiquement les erreurs OCR** et libère les **ressources IA** afin que votre GPU reste heureux. 
+ +## Ce dont vous avez besoin + +- Python 3.9+ (le code utilise des annotations de type mais fonctionne sur les versions 3.x antérieures) +- `asposeocr` package (`pip install asposeocr`) – fournit le moteur OCR. +- Accès au modèle Hugging Face `bartowski/Qwen2.5-3B-Instruct-GGUF` (téléchargé automatiquement). +- Un GPU avec au moins quelques Go de VRAM (le script définit `gpu_layers = 30`, vous pouvez le réduire si nécessaire). + +Aucun service externe, aucune API payante – tout s'exécute localement. + +--- + +## Étape 1 : Configurer le moteur OCR – **Comment faire du OCR par lots** efficacement + +Avant de pouvoir traiter mille images, nous avons besoin d'un moteur OCR solide. Aspose OCR nous permet de choisir la langue et le mode de reconnaissance en un seul appel. + +```python +# Step 1: Initialize the OCR engine for English plain‑text output +def init_ocr() -> aocr.OcrEngine: + ocr_engine = aocr.OcrEngine() + ocr_engine.language = aocr.Language.English # English language pack + ocr_engine.recognize_mode = aocr.RecognitionMode.Plain # Returns raw string, no layout + return ocr_engine +``` + +**Pourquoi c’est important :** Définir `recognize_mode` à `Plain` garde la sortie légère, ce qui est idéal lorsque vous prévoyez d'exécuter une vérification orthographique plus tard. Si vous avez besoin d'informations de mise en page, vous passeriez à `Layout`, mais cela ajoute une surcharge que vous ne voulez probablement pas dans un travail par lots. + +> **Astuce :** Si vous traitez des scans multilingues, vous pouvez passer une liste comme `ocr_engine.language = [aocr.Language.English, aocr.Language.Spanish]`. + +--- + +## Étape 2 : Initialiser le post‑processeur IA – **Appliquer la vérification orthographique** à la sortie OCR + +Aspose AI est fourni avec un post‑processeur intégré qui peut exécuter n'importe quel modèle de votre choix. Ici, nous récupérons un modèle Qwen 2.5 quantifié depuis Hugging Face et connectons la routine de vérification orthographique. 
+ +```python +# Step 2: Configure and start the AI post‑processor +def init_ai() -> aocr.ai.AsposeAI: + model_cfg = AsposeAIModelConfig() + model_cfg.allow_auto_download = "true" + model_cfg.hugging_face_repo_id = "bartowski/Qwen2.5-3B-Instruct-GGUF" + model_cfg.hugging_face_quantization = "q4_k_m" + model_cfg.gpu_layers = 30 # Adjust based on your GPU memory + ai_processor = AsposeAI() + ai_processor.initialize(model_cfg) + + # Attach the built‑in spell‑check post‑processor + ai_processor.set_post_processor(ai_processor.postprocessor_spell_check, {}) + return ai_processor +``` + +**Pourquoi c’est important :** Le modèle est quantifié (`q4_k_m`), ce qui réduit considérablement l'utilisation de mémoire tout en offrant une compréhension linguistique décente. En appelant `set_post_processor`, nous indiquons à Aspose AI d'exécuter automatiquement l'étape **apply spell check** sur toute chaîne que nous lui fournissons. + +> **Attention :** Si votre GPU ne peut pas gérer 30 couches, réduisez le nombre à 15 voire 5 – le script fonctionnera toujours, simplement un peu plus lent. + +--- + +## Étape 3 : Exécuter l'OCR et **corriger les erreurs OCR** sur une seule image + +Maintenant que le moteur OCR et le correcteur orthographique IA sont prêts, nous les combinons. Cette fonction charge une image, extrait le texte brut, puis exécute le post‑processeur IA pour le nettoyer. + +```python +# Step 3: OCR an image and run the spell‑check post‑processor +def ocr_and_correct(image_path: str, + ocr_engine: aocr.OcrEngine, + ai_processor: aocr.ai.AsposeAI) -> str: + image = aocr.Image.load(image_path) # Load any supported format + raw_text = ocr_engine.recognize(image) # Plain string from OCR + corrected_text = ai_processor.run_postprocessor(raw_text) + return corrected_text +``` + +**Pourquoi c’est important :** Alimenter directement la chaîne OCR brute dans le modèle IA nous donne un passage **correct OCR errors** sans écrire de regexes ou de dictionnaires personnalisés. 
Le modèle comprend le contexte, il peut corriger « recieve » → « receive » et même des erreurs plus subtiles. + +--- + +## Étape 4 : **Extraire du texte d'images** en masse – La vraie boucle par lots + +C’est ici que la magie du **how to batch OCR** se révèle. Nous parcourons un répertoire, ignorons les fichiers non pris en charge, et écrivons chaque sortie corrigée dans un fichier `.txt`. + +```python +# Step 4: Process an entire folder of images +if __name__ == "__main__": + # Initialize once – reuse for every file + ocr_engine = init_ocr() + ai_processor = init_ai() + + input_dir = "YOUR_DIRECTORY/input_images" + output_dir = "YOUR_DIRECTORY/output_text" + os.makedirs(output_dir, exist_ok=True) + + for file_name in os.listdir(input_dir): + # Only handle common image extensions + if not file_name.lower().endswith(('.png', '.jpg', '.jpeg', '.tif', '.tiff')): + continue + + image_path = os.path.join(input_dir, file_name) + corrected = ocr_and_correct(image_path, ocr_engine, ai_processor) + + txt_path = os.path.join(output_dir, + os.path.splitext(file_name)[0] + ".txt") + with open(txt_path, "w", encoding="utf-8") as txt_file: + txt_file.write(corrected) + + print(f"Processed {file_name}") + + # Step 5: Release **free AI resources** after the batch finishes + ai_processor.free_resources() +``` + +### Sortie attendue + +Pour une image contenant la phrase *« The quick brown fox jumps over the lazzy dog. »* vous verrez un fichier texte contenant : + +``` +The quick brown fox jumps over the lazy dog. +``` + +Remarquez que le double « z » a été corrigé automatiquement – c’est le correcteur orthographique IA en action. + +**Pourquoi c’est important :** En créant les objets OCR et IA **une seule fois** et en les réutilisant, nous évitons le surcoût de chargement du modèle pour chaque fichier. C’est la façon la plus efficace de **how to batch OCR** à grande échelle. 
+ +--- + +## Étape 5 : Nettoyer – **Libérer les ressources IA** correctement + +Lorsque vous avez terminé, appeler `free_resources()` libère la mémoire GPU, les contextes CUDA, et tous les fichiers temporaires créés par le modèle. + +```python +# Step 5: Explicitly free GPU and model memory +ai_processor.free_resources() +``` + +Ignorer cette étape peut laisser des allocations GPU en suspens, ce qui pourrait faire planter les processus Python suivants ou consommer de la VRAM. Considérez cela comme l’étape « éteindre les lumières » d’un travail par lots. + +--- + +## Pièges courants & astuces supplémentaires + +| Issue | What to Look For | Fix | +|-------|------------------|-----| +| **Erreurs de mémoire insuffisante** | Le GPU s'épuise après quelques dizaines d'images | Réduisez `gpu_layers` ou passez au CPU (`model_cfg.gpu_layers = 0`). | +| **Pack de langue manquant** | OCR renvoie des chaînes vides | Assurez‑vous que la version `asposeocr` inclut les données de langue anglaise ; réinstallez si nécessaire. | +| **Fichiers non‑image** | Le script plante sur un `.pdf` errant | La garde `if not file_name.lower().endswith(...)` les ignore déjà. | +| **Vérification orthographique non appliquée** | La sortie ressemble à l'OCR brut | Vérifiez que `ai_processor.set_post_processor` a été appelé avant la boucle. | +| **Vitesse de lot lente** | Prend plus de 5 secondes par image | Activez `model_cfg.allow_auto_download = "false"` après la première exécution, afin que le modèle ne soit pas re‑téléchargé à chaque fois. | + +**Astuce :** Si vous devez **extraire du texte d'images** dans une langue autre que l'anglais, changez simplement `ocr_engine.language` à l'énumération appropriée (par ex., `aocr.Language.French`). Le même post‑processeur IA appliquera toujours la vérification orthographique, mais vous pourriez vouloir un modèle spécifique à la langue pour de meilleurs résultats. 
+ +--- + +## Récapitulatif & étapes suivantes + +Nous avons couvert l’ensemble du pipeline pour **how to batch OCR** : + +1. **Initialiser** un moteur OCR texte brut pour l'anglais. +2. **Configurer** un modèle de vérification orthographique IA et le lier comme post‑processeur. +3. **Exécuter** l'OCR sur chaque image et laisser l'IA **corriger les erreurs OCR** automatiquement. +4. **Boucler** sur un répertoire pour **extraire du texte d'images** en masse. +5. **Libérer les ressources IA** une fois le travail terminé. + +À partir d'ici, vous pourriez : + +- Acheminer le texte corrigé dans une pipeline NLP en aval (analyse de sentiment, extraction d'entités, etc.). +- Remplacer le post‑processeur de vérification orthographique par un résumeur personnalisé en appelant `ai_processor.set_post_processor(your_custom_func, {})`. +- Paralléliser la boucle du dossier avec `concurrent.futures.ThreadPoolExecutor` si votre GPU peut gérer plusieurs flux. + +--- + +## Réflexions finales + +Le traitement OCR par lots n’a pas besoin d’être une corvée. En combinant Aspose OCR avec un modèle IA léger, vous obtenez une **solution tout‑en‑un** qui **extrait du texte d'images**, **applique la vérification orthographique**, **corrige les erreurs OCR**, et **libère proprement les ressources IA**. Testez le script sur un dossier d’essai, ajustez le nombre de couches GPU pour correspondre à votre matériel, et vous disposerez d’un pipeline prêt pour la production en quelques minutes. + +Des questions sur l’ajustement du modèle, la gestion des PDF, ou l’intégration dans un service web ? Laissez un commentaire ci‑dessous ou contactez‑moi sur GitHub. Bon codage, et que votre OCR soit toujours précis ! 
+ +{{< /blocks/products/pf/tutorial-page-section >}} +{{< /blocks/products/pf/main-container >}} +{{< /blocks/products/pf/main-wrap-class >}} +{{< blocks/products/products-backtop-button >}} \ No newline at end of file diff --git a/ocr/french/python/general/python-ocr-tutorial-batch-ocr-processing-made-easy/_index.md b/ocr/french/python/general/python-ocr-tutorial-batch-ocr-processing-made-easy/_index.md new file mode 100644 index 000000000..bbb82841f --- /dev/null +++ b/ocr/french/python/general/python-ocr-tutorial-batch-ocr-processing-made-easy/_index.md @@ -0,0 +1,300 @@ +--- +category: general +date: 2026-05-03 +description: Tutoriel Python OCR montrant comment charger des fichiers image PNG, + reconnaître le texte à partir d’une image et des ressources IA gratuites pour le + traitement OCR par lots. +draft: false +keywords: +- python ocr tutorial +- batch ocr processing +- free ai resources +- load png image +- recognize text from image +language: fr +og_description: Le tutoriel Python OCR vous guide à travers le chargement d’images + PNG, la reconnaissance de texte à partir d’une image et la gestion des ressources + IA gratuites pour le traitement OCR par lots. +og_title: Tutoriel OCR Python – OCR par lots rapide avec des ressources IA gratuites +tags: +- OCR +- Python +- AI +title: Tutoriel OCR Python – Traitement par lots d'OCR simplifié +url: /fr/python/general/python-ocr-tutorial-batch-ocr-processing-made-easy/ +--- + +{{< blocks/products/pf/main-wrap-class >}} +{{< blocks/products/pf/main-container >}} +{{< blocks/products/pf/tutorial-page-section >}} + +# Tutoriel OCR Python – Traitement par lots simplifié + +Vous avez déjà eu besoin d'un **python ocr tutorial** qui vous permette réellement de lancer de l'OCR sur des dizaines de fichiers PNG sans perdre patience ? Vous n'êtes pas seul. Dans de nombreux projets réels, il faut **load png image** des fichiers, les envoyer à un moteur, puis nettoyer les ressources IA une fois terminé. 
+ +Dans ce guide, nous parcourrons un exemple complet, prêt à l’emploi, qui montre exactement comment **recognize text from image** des fichiers, les traiter par lots, et libérer la mémoire IA sous‑jacente. À la fin, vous disposerez d’un script autonome que vous pourrez intégrer à n’importe quel projet—sans fioritures, juste l’essentiel. + +## Ce dont vous avez besoin + +- Python 3.10 ou plus récent (la syntaxe utilisée repose sur les f‑strings et les annotations de type) +- Une bibliothèque OCR qui expose une méthode `engine.recognize` — pour la démonstration, nous supposerons un package fictif `aocr`, mais vous pouvez le remplacer par Tesseract, EasyOCR, etc. +- Le module d’aide `ai` montré dans l’extrait de code (il gère l’initialisation du modèle et le nettoyage des ressources) +- Un dossier rempli de fichiers PNG que vous souhaitez traiter + +Si vous n’avez pas `aocr` ou `ai` installés, vous pouvez les simuler avec des stubs — voir la section « Optional Stubs » près de la fin. + +## Étape 1 : Initialiser le moteur IA (Free AI Resources) + +Avant d’alimenter le pipeline OCR avec une image, le modèle sous‑jacent doit être prêt. L’initialiser une seule fois économise de la mémoire et accélère les traitements par lots. + +```python +# step_1_initialize.py +import ai # hypothetical helper that wraps the AI model +import aocr # OCR library + +def init_engine(config_path: str = "config.yaml"): + """ + Initialize the AI engine if it hasn't been set up yet. + This uses free AI resources – the engine will be released later. + """ + if not ai.is_initialized(): + ai.initialize(config_path) # auto‑initialize with the provided configuration + else: + print("Engine already initialized.") +``` + +**Pourquoi c’est important :** +Appeler `ai.initialize` à chaque image allouerait de la mémoire GPU à chaque fois, ce qui finirait par faire planter le script. En vérifiant `ai.is_initialized()` nous garantissons une allocation unique — c’est le principe du « free AI resources ». 
+ +## Étape 2 : Charger les fichiers PNG pour le traitement OCR par lots + +Nous rassemblons maintenant tous les fichiers PNG que nous voulons passer à l’OCR. L’utilisation de `pathlib` rend le code indépendant du système d’exploitation. + +```python +# step_2_load_images.py +from pathlib import Path +from typing import List + +def collect_png_paths(directory: str) -> List[Path]: + """ + Scan `directory` and return a list of Path objects pointing to PNG files. + """ + base_path = Path(directory) + if not base_path.is_dir(): + raise NotADirectoryError(f"'{directory}' is not a valid folder.") + + png_files = sorted(base_path.glob("*.png")) + if not png_files: + raise FileNotFoundError("No PNG images found in the specified directory.") + + print(f"Found {len(png_files)} PNG image(s) to process.") + return png_files +``` + +**Cas limite :** +Si le dossier contient des fichiers non‑PNG (par ex. des JPEG), ils seront ignorés, évitant ainsi que `engine.recognize` ne plante sur un format non supporté. + +## Étape 3 : Exécuter l’OCR sur chaque image et appliquer le post‑traitement + +Avec le moteur prêt et la liste de fichiers préparée, nous pouvons parcourir les images, extraire le texte brut, et le transmettre à un post‑processeur qui nettoie les artefacts OCR courants (comme les sauts de ligne parasites). + +```python +# step_3_ocr_batch.py +import aocr +import ai +from pathlib import Path +from typing import List + +def ocr_batch(image_paths: List[Path]) -> List[str]: + """ + Perform OCR on each PNG image and return a list of cleaned strings. 
+ """ + results = [] + for image_path in image_paths: + # Load the image – aocr.Image.load abstracts away Pillow/OpenCV details + img = aocr.Image.load(str(image_path)) + + # Recognize raw text + raw_text = engine.recognize(img) + + # Refine the raw OCR output using the AI post‑processor + cleaned_text = ai.run_postprocessor(raw_text) + results.append(cleaned_text) + + print(f"Processed {image_path.name}: {len(cleaned_text)} characters extracted.") + + return results +``` + +**Pourquoi séparer le chargement de la reconnaissance :** +`aocr.Image.load` peut effectuer un décodage paresseux, ce qui est plus rapide pour de gros lots. Garder l’étape de chargement explicite facilite également le remplacement par une autre bibliothèque d’images si vous devez plus tard gérer des JPEG ou TIFF. + +## Étape 4 : Nettoyage – Libérer les ressources IA après le lot + +Une fois le lot terminé, nous devons libérer le modèle pour éviter les fuites de mémoire, surtout sur les machines équipées de GPU. + +```python +# step_4_cleanup.py +import ai + +def release_resources(): + """ + Free any allocated AI resources. Safe to call multiple times. + """ + if ai.is_initialized(): + ai.free_resources() + print("AI resources have been released.") + else: + print("No AI resources were allocated.") +``` + +## Assemblage complet – Le script entier + +Voici un fichier unique qui assemble les quatre étapes en un flux de travail cohérent. Enregistrez‑le sous le nom `batch_ocr.py` et exécutez‑le depuis la ligne de commande. + +```python +# batch_ocr.py +""" +Python OCR tutorial – end‑to‑end batch OCR processing. +Loads PNG images, runs OCR, post‑processes results, and frees AI resources. 
+""" + +import sys +from pathlib import Path +import ai +import aocr + +# ---------------------------------------------------------------------- +# Helper functions (copied from the steps above) +# ---------------------------------------------------------------------- +def init_engine(cfg: str = "config.yaml"): + if not ai.is_initialized(): + ai.initialize(cfg) + else: + print("Engine already initialized.") + +def collect_png_paths(directory: str): + base_path = Path(directory) + if not base_path.is_dir(): + raise NotADirectoryError(f"'{directory}' is not a valid folder.") + png_files = sorted(base_path.glob("*.png")) + if not png_files: + raise FileNotFoundError("No PNG images found in the specified directory.") + print(f"Found {len(png_files)} PNG image(s) to process.") + return png_files + +def ocr_batch(image_paths): + results = [] + for image_path in image_paths: + img = aocr.Image.load(str(image_path)) + raw_text = engine.recognize(img) + cleaned_text = ai.run_postprocessor(raw_text) + results.append(cleaned_text) + print(f"Processed {image_path.name}: {len(cleaned_text)} characters.") + return results + +def release_resources(): + if ai.is_initialized(): + ai.free_resources() + print("AI resources released.") + else: + print("No resources to release.") + +# ---------------------------------------------------------------------- +# Main execution block +# ---------------------------------------------------------------------- +if __name__ == "__main__": + if len(sys.argv) != 2: + print("Usage: python batch_ocr.py ") + sys.exit(1) + + image_dir = sys.argv[1] + + try: + init_engine() + png_paths = collect_png_paths(image_dir) + texts = ocr_batch(png_paths) + + # Optional: write results to a single text file + output_file = Path("ocr_results.txt") + with output_file.open("w", encoding="utf-8") as f: + for path, txt in zip(png_paths, texts): + f.write(f"--- {path.name} ---\n") + f.write(txt + "\n\n") + print(f"All results saved to {output_file.resolve()}") + 
finally: + release_resources() +``` + +### Résultat attendu + +L’exécution du script sur un dossier contenant trois PNG pourrait afficher : + +``` +Engine already initialized. +Found 3 PNG image(s) to process. +Processed invoice1.png: 452 characters. +Processed receipt2.png: 317 characters. +Processed flyer3.png: 689 characters. +All results saved to /home/user/ocr_results.txt +AI resources released. +``` + +Le fichier `ocr_results.txt` contiendra un séparateur clair pour chaque image suivi du texte OCR nettoyé. + +## Stubs optionnels pour aocr & ai (si vous n’avez pas les vrais paquets) + +Si vous voulez simplement tester le flux sans charger de lourdes bibliothèques OCR, vous pouvez créer des modules factices minimalistes : + +```python +# aocr/__init__.py +class Image: + @staticmethod + def load(path): + return f"ImageObject({path})" + +def dummy_recognize(image): + return "Raw OCR output for " + str(image) + +# staticmethod() est nécessaire : sinon l'appel engine.recognize(img) lierait +# l'instance comme premier argument et lèverait un TypeError. +engine = type("Engine", (), {"recognize": staticmethod(dummy_recognize)})() +``` + +```python +# ai/__init__.py +_state = {"initialized": False} + +def is_initialized(): + return _state["initialized"] + +def initialize(cfg): + print(f"Initializing AI engine with {cfg}") + _state["initialized"] = True + +def run_postprocessor(text): + # Very naive cleanup: strip extra spaces + return " ".join(text.split()) + +def free_resources(): + print("Freeing AI resources") + _state["initialized"] = False +``` + +Placez ces dossiers à côté de `batch_ocr.py` et le script s’exécutera, affichant des résultats factices. + +## Astuces pro & pièges courants + +- **Pics de mémoire :** Si vous traitez des milliers de PNG haute résolution, pensez à les redimensionner avant l’OCR. `aocr.Image.load` accepte souvent un argument `max_size`. +- **Gestion Unicode :** Ouvrez toujours le fichier de sortie avec `encoding="utf-8"` ; les moteurs OCR peuvent produire des caractères non‑ASCII. 
+- **Parallélisme :** Pour un OCR limité par le CPU, vous pouvez envelopper `ocr_batch` dans un `concurrent.futures.ThreadPoolExecutor`. N’oubliez pas de ne garder qu’une seule instance `ai` — créer de nombreux threads qui appellent chacun `ai.initialize` contredit l’objectif « free AI resources ». +- **Résilience aux erreurs :** Enveloppez la boucle par image dans un bloc `try/except` afin qu’un PNG corrompu n’arrête pas tout le traitement. + +## Conclusion + +Vous disposez maintenant d’un **python ocr tutorial** qui montre comment **load png image** des fichiers, réaliser un **batch OCR processing**, et gérer de façon responsable les **free AI resources**. L’exemple complet et exécutable montre exactement comment **recognize text from image** et nettoyer après, afin que vous puissiez le copier‑coller dans vos propres projets sans chercher des pièces manquantes. + +Prêt pour l’étape suivante ? Essayez de remplacer les modules factices `aocr` et `ai` par de vraies bibliothèques comme `pytesseract` et `torchvision`. Vous pouvez également étendre le script pour produire du JSON, pousser les résultats vers une base de données, ou l’intégrer à un bucket de stockage cloud. Le ciel est la limite—bon codage ! 
+ +{{< /blocks/products/pf/tutorial-page-section >}} +{{< /blocks/products/pf/main-container >}} +{{< /blocks/products/pf/main-wrap-class >}} +{{< blocks/products/products-backtop-button >}} \ No newline at end of file diff --git a/ocr/french/python/general/run-ocr-on-image-complete-guide-to-structured-text-extractio/_index.md b/ocr/french/python/general/run-ocr-on-image-complete-guide-to-structured-text-extractio/_index.md new file mode 100644 index 000000000..4ebcb8cad --- /dev/null +++ b/ocr/french/python/general/run-ocr-on-image-complete-guide-to-structured-text-extractio/_index.md @@ -0,0 +1,258 @@ +--- +category: general +date: 2026-05-03 +description: Apprenez à exécuter l'OCR sur une image et à extraire le texte avec ses + coordonnées en utilisant la reconnaissance OCR structurée. Code Python étape par + étape inclus. +draft: false +keywords: +- run OCR on image +- extract text with coordinates +- structured OCR recognition +- OCR post‑processing +- bounding box extraction +- image text detection +language: fr +og_description: Exécutez la reconnaissance OCR sur une image et obtenez le texte avec + les coordonnées grâce à la reconnaissance OCR structurée. Exemple complet en Python + avec explications. +og_title: Exécuter la reconnaissance optique de caractères sur une image – Tutoriel + d'extraction de texte structuré +tags: +- OCR +- Python +- Computer Vision +title: Exécuter la reconnaissance optique de caractères sur une image – Guide complet + de l'extraction de texte structuré +url: /fr/python/general/run-ocr-on-image-complete-guide-to-structured-text-extractio/ +--- + +{{< blocks/products/pf/main-wrap-class >}} +{{< blocks/products/pf/main-container >}} +{{< blocks/products/pf/tutorial-page-section >}} + +# Exécuter OCR sur image – Guide complet de l'extraction de texte structuré + +Vous avez déjà eu besoin de **run OCR on image** fichiers mais vous n'étiez pas sûr de comment conserver les positions exactes de chaque mot ? Vous n'êtes pas seul. 
Dans de nombreux projets—numérisation de reçus, digitalisation de formulaires ou tests d'interface utilisateur—vous avez besoin non seulement du texte brut mais aussi des boîtes englobantes qui indiquent où chaque ligne se trouve sur l'image. + +Ce tutoriel vous montre une méthode pratique pour *run OCR on image* en utilisant le moteur **aocr**, demander la **structured OCR recognition**, puis post‑traiter le résultat tout en préservant la géométrie. À la fin, vous pourrez **extract text with coordinates** en quelques lignes de Python, et vous comprendrez pourquoi le mode structuré est important pour les tâches en aval. + +## Ce que vous apprendrez + +- Comment initialiser le moteur OCR pour la **structured OCR recognition**. +- Comment fournir une image et recevoir les résultats bruts incluant les limites des lignes. +- Comment exécuter un post‑processor qui nettoie le texte sans perdre la géométrie. +- Comment itérer sur les lignes finales et afficher chaque morceau de texte avec sa boîte englobante. + +Pas de magie, pas d'étapes cachées—juste un exemple complet et exécutable que vous pouvez intégrer dans votre propre projet. + +--- + +## Prérequis + +Avant de commencer, assurez-vous d'avoir installé les éléments suivants : + +```bash +pip install aocr ai # hypothetical packages; replace with real ones if needed +``` + +Vous aurez également besoin d'un fichier image (`input_image.png` ou `.jpg`) contenant du texte clair et lisible. Tout, d'une facture numérisée à une capture d'écran, fonctionne tant que le moteur OCR peut voir les caractères. + +--- + +## Étape 1 : Initialise le moteur OCR pour la reconnaissance structurée + +La première chose que nous faisons est de créer une instance de `aocr.Engine()` et de lui indiquer que nous voulons la **structured OCR recognition**. 
Le mode structuré renvoie non seulement le texte brut mais aussi les données géométriques (rectangles englobants) pour chaque ligne, ce qui est essentiel lorsque vous devez mapper le texte sur l'image. + +```python +import aocr +import ai # hypothetical post‑processing module + +# Initialise the OCR engine +ocr_engine = aocr.Engine() + +# Request structured recognition (text + geometry) +ocr_engine.recognize_mode = aocr.RecognitionMode.Structured +``` + +> **Pourquoi c'est important :** +> En mode par défaut, le moteur peut ne vous fournir qu'une chaîne de mots concaténés. Le mode structuré vous donne une hiérarchie pages → lignes → mots, chacun avec des coordonnées, ce qui facilite grandement la superposition des résultats sur l'image originale ou leur alimentation dans un modèle sensible à la mise en page. + +--- + +## Étape 2 : Exécuter OCR sur l'image et obtenir les résultats bruts + +Nous fournissons maintenant l'image au moteur. L'appel `recognize` renvoie un objet `OcrResult` qui contient une collection de lignes, chacune avec son propre rectangle englobant. + +```python +# Load your image (any format supported by aocr) +input_image_path = "input_image.png" + +# Run OCR – this returns an OcrResult with lines and bounds +raw_result = ocr_engine.recognize(input_image_path) +``` + +À ce stade, `raw_result.lines` contient des objets avec deux attributs importants : + +- `text` – la chaîne reconnue pour cette ligne. +- `bounds` – un tuple comme `(x, y, width, height)` décrivant la position de la ligne. + +--- + +## Étape 3 : Post‑processer tout en préservant la géométrie + +La sortie brute d'OCR est souvent bruyante : caractères parasites, espaces mal placés ou problèmes de sauts de ligne. La fonction `ai.run_postprocessor` nettoie le texte mais **conserve la géométrie originale** intacte, de sorte que vous avez toujours des coordonnées précises. 
+ +```python +# Apply a post‑processing step that corrects common OCR errors +postprocessed_result = ai.run_postprocessor(raw_result) + +# The structure (lines + bounds) stays the same, only `line.text` changes +``` + +> **Astuce pro :** Si vous avez des vocabulaires spécifiques à un domaine (par ex., des codes produit), fournissez un dictionnaire personnalisé au post‑processor pour améliorer la précision. + +--- + +## Étape 4 : Extract text with coordinates – itérer et afficher + +Enfin, nous parcourons les lignes nettoyées, affichant la boîte englobante de chaque ligne à côté de son texte. C’est le cœur de **extract text with coordinates**. + +```python +# Print each recognised line together with its bounding box +for line in postprocessed_result.lines: + print(f"[{line.bounds}] {line.text}") +``` + +### Résultat attendu + +En supposant que l'image d'entrée contienne deux lignes : « Invoice #12345 » et « Total: $89.99 », vous verrez quelque chose comme : + +``` +[(15, 30, 210, 25)] Invoice #12345 +[(15, 70, 190, 25)] Total: $89.99 +``` + +Le premier tuple est le `(x, y, width, height)` de la ligne sur l'image originale, vous permettant de dessiner des rectangles, de mettre en évidence le texte ou d'alimenter les coordonnées dans un autre système. + +--- + +## Visualiser le résultat (optionnel) + +Si vous voulez voir les boîtes englobantes superposées sur l'image, vous pouvez utiliser Pillow (PIL) pour dessiner des rectangles. Voici un extrait rapide ; n'hésitez pas à le sauter si vous n'avez besoin que des données brutes. 
+ +```python +from PIL import Image, ImageDraw + +# Open the original image +img = Image.open(input_image_path) +draw = ImageDraw.Draw(img) + +# Draw a rectangle around each line +for line in postprocessed_result.lines: + x, y, w, h = line.bounds + draw.rectangle([x, y, x + w, y + h], outline="red", width=2) + +# Save or show the annotated image +img.save("annotated_output.png") +img.show() +``` + +![run OCR on image example showing bounding boxes](/images/ocr-bounding-boxes.png "run OCR on image – bounding box overlay") + +Le texte alternatif ci‑dessus contient le **primary keyword**, répondant à l'exigence SEO pour les attributs alt d'image. + +--- + +## Pourquoi la reconnaissance OCR structurée surpasse l'extraction de texte simple + +Vous vous demandez peut‑être, « Je ne peux pas simplement exécuter OCR et obtenir le texte ? Pourquoi se soucier de la géométrie ? » + +- **Contexte spatial :** Lorsque vous devez mapper les champs d'un formulaire (par ex., « Date » à côté d'une valeur de date), les coordonnées vous indiquent *où* les données se trouvent. +- **Mises en page multi‑colonnes :** Le texte linéaire simple perd l'ordre ; les données structurées conservent l'ordre des colonnes. +- **Précision du post‑processus :** Connaître la taille de la boîte vous aide à décider si un mot est un en‑tête, une note de bas de page ou un artefact parasite. + +En bref, la **structured OCR recognition** vous offre la flexibilité de créer des pipelines plus intelligents—que vous alimentiez des données dans une base de données, créiez des PDF recherchables, ou entraîniez un modèle d'apprentissage automatique qui respecte la mise en page. + +--- + +## Cas limites courants et comment les gérer + +| Situation | Ce qu'il faut surveiller | Solution suggérée | +|-----------|--------------------------|-------------------| +| **Images tournées ou inclinées** | Les boîtes englobantes peuvent être hors axe. | Pré‑traiter avec un redressement (par ex., `warpAffine` d'OpenCV). 
| +| **Polices très petites** | Le moteur peut manquer des caractères, entraînant des lignes vides. | Augmenter la résolution de l'image ou utiliser `ocr_engine.set_dpi(300)`. | +| **Langues mixtes** | Un mauvais modèle de langue peut produire du texte illisible. | Définir `ocr_engine.language = ["en", "de"]` avant la reconnaissance. | +| **Boîtes qui se chevauchent** | Le post‑processor peut fusionner deux lignes involontairement. | Vérifier `line.bounds` après le traitement ; ajuster les seuils dans `ai.run_postprocessor`. | + +Aborder ces scénarios tôt vous évite des maux de tête plus tard, surtout lorsque vous faites évoluer la solution à des centaines de documents par jour. + +--- + +## Script complet de bout en bout + +Voici le programme complet, prêt à être exécuté, qui relie toutes les étapes. Copiez‑collez, ajustez le chemin de l'image, et vous êtes prêt. + +```python +# -*- coding: utf-8 -*- +""" +Run OCR on image – extract text with coordinates using structured OCR recognition. 
+Author: Your Name +Date: 2026-05-03 +""" + +import aocr +import ai +from PIL import Image, ImageDraw + +def run_structured_ocr(image_path: str, annotate: bool = False): + # 1️⃣ Initialise the OCR engine + ocr_engine = aocr.Engine() + ocr_engine.recognize_mode = aocr.RecognitionMode.Structured + + # 2️⃣ Recognise the image + raw_result = ocr_engine.recognize(image_path) + + # 3️⃣ Post‑process while keeping geometry + processed = ai.run_postprocessor(raw_result) + + # 4️⃣ Print each line with its bounding box + for line in processed.lines: + print(f"[{line.bounds}] {line.text}") + + # Optional visualisation + if annotate: + img = Image.open(image_path) + draw = ImageDraw.Draw(img) + for line in processed.lines: + x, y, w, h = line.bounds + draw.rectangle([x, y, x + w, y + h], outline="red", width=2) + annotated_path = "annotated_" + image_path + img.save(annotated_path) + print(f"Annotated image saved as {annotated_path}") + +if __name__ == "__main__": + INPUT_IMG = "input_image.png" + run_structured_ocr(INPUT_IMG, annotate=True) +``` + +Exécuter ce script permettra de : + +1. **Run OCR on image** avec le mode structuré. +2. **Extract text with coordinates** pour chaque ligne. +3. Optionnellement produire un PNG annoté montrant les boîtes. + +--- + +## Conclusion + +Vous disposez maintenant d'une solution solide et autonome pour **run OCR on image** et **extract text with coordinates** en utilisant la **structured OCR recognition**. Le code montre chaque étape—de l'initialisation du moteur au post‑processing et à la vérification visuelle—afin que vous puissiez l'adapter aux reçus, formulaires ou tout document visuel nécessitant une localisation précise du texte. + +Et ensuite ? Essayez de remplacer le moteur `aocr` par une autre bibliothèque (Tesseract, EasyOCR) et voyez comment leurs sorties structurées diffèrent. 
Expérimentez différentes stratégies de post‑processing, comme la correction orthographique ou des filtres regex personnalisés, pour améliorer la précision dans votre domaine. Et si vous construisez un pipeline plus large, envisagez de stocker les paires `(text, bounds)` dans une base de données pour des analyses ultérieures. + +Bon codage, et que vos projets OCR soient toujours précis ! + +{{< /blocks/products/pf/tutorial-page-section >}} +{{< /blocks/products/pf/main-container >}} +{{< /blocks/products/pf/main-wrap-class >}} +{{< blocks/products/products-backtop-button >}} \ No newline at end of file diff --git a/ocr/german/python/general/extract-text-from-image-ocr-with-aspose-ai-spell-check/_index.md b/ocr/german/python/general/extract-text-from-image-ocr-with-aspose-ai-spell-check/_index.md new file mode 100644 index 000000000..4a95a3860 --- /dev/null +++ b/ocr/german/python/general/extract-text-from-image-ocr-with-aspose-ai-spell-check/_index.md @@ -0,0 +1,230 @@ +--- +category: general +date: 2026-05-03 +description: Extrahiere Text aus einem Bild mit Aspose OCR und KI‑Rechtschreibprüfung. + Lerne, wie man ein Bild OCRt, das Bild für OCR lädt, Text aus einer Rechnung erkennt + und GPU‑Ressourcen freigibt. +draft: false +keywords: +- extract text from image +- how to ocr image +- load image for ocr +- release gpu resources +- recognize text from invoice +language: de +og_description: Text aus Bild mit Aspose OCR und KI‑Rechtschreibprüfung extrahieren. + Schritt‑für‑Schritt‑Anleitung, wie man ein Bild OCRt, das Bild für OCR lädt und + GPU‑Ressourcen freigibt. 
+og_title: Text aus Bild extrahieren – Vollständiger OCR‑ und Rechtschreibprüfungs‑Leitfaden +tags: +- OCR +- Aspose +- AI +- Python +title: Text aus Bild extrahieren – OCR mit Aspose KI‑Rechtschreibprüfung +url: /de/python/general/extract-text-from-image-ocr-with-aspose-ai-spell-check/ +--- + +{{< blocks/products/pf/main-wrap-class >}} +{{< blocks/products/pf/main-container >}} +{{< blocks/products/pf/tutorial-page-section >}} + +# Text aus Bild extrahieren – Komplett‑OCR‑ & Rechtschreib‑Check‑Leitfaden + +Haben Sie schon einmal **Text aus einem Bild extrahieren** müssen, waren sich aber nicht sicher, welche Bibliothek sowohl Geschwindigkeit als auch Genauigkeit liefert? Sie sind nicht allein. In vielen Praxis‑Projekten – denken Sie an Rechnungs‑Verarbeitung, Beleg‑Digitalisierung oder das Scannen von Verträgen – ist das Sauber‑machen von durchsuchbarem Text aus einem Bild die erste Hürde. + +Die gute Nachricht: Aspose OCR in Kombination mit einem leichten Aspose AI‑Modell erledigt diese Aufgabe in wenigen Zeilen Python. In diesem Tutorial zeigen wir **wie man ein Bild OCR‑t**, das Bild korrekt lädt, einen integrierten Rechtschreib‑Check‑Post‑Processor ausführt und schließlich **GPU‑Ressourcen freigibt**, damit Ihre Anwendung speichereffizient bleibt. + +Am Ende dieses Leitfadens können Sie **Text aus Rechnungs‑Bildern erkennen**, gängige OCR‑Fehler automatisch korrigieren und Ihre GPU für den nächsten Durchlauf sauber halten. + +--- + +## Was Sie benötigen + +- Python 3.9 oder neuer (der Code verwendet Typ‑Hints, funktioniert aber auch mit früheren 3.x‑Versionen) +- `aspose-ocr` und `aspose-ai` Pakete (Installation via `pip install aspose-ocr aspose-ai`) +- Eine CUDA‑fähige GPU ist optional; das Skript greift auf die CPU zurück, wenn keine gefunden wird. +- Ein Beispielbild, z. B. `sample_invoice.png`, das in einem Ordner liegt, den Sie referenzieren können. 
+ +Keine schweren ML‑Frameworks, keine massiven Modell‑Downloads – nur ein kleines Q4‑K‑M‑quantisiertes Modell, das bequem auf den meisten GPUs passt. + +--- + +## Schritt 1: OCR‑Engine initialisieren – Text aus Bild extrahieren + +Zuerst erstellen Sie eine `OcrEngine`‑Instanz und geben an, welche Sprache Sie erwarten. Hier wählen wir Englisch und verlangen reine Textausgabe, was ideal für nachgelagerte Verarbeitung ist. + +```python +import aocr # Aspose OCR package +import aspose.ai as ai # Aspose AI package + +# Initialise the OCR engine +ocr_engine = aocr.OcrEngine() +ocr_engine.language = aocr.Language.English # Choose any supported language +ocr_engine.recognize_mode = aocr.RecognitionMode.Plain # Plain text makes post‑processing easier +``` + +**Warum das wichtig ist:** Das Setzen der Sprache begrenzt den Zeichensatz und verbessert die Genauigkeit. Der Nur‑Text‑Modus entfernt Layout‑Informationen, die Sie normalerweise nicht benötigen, wenn Sie einfach nur Text aus einem Bild extrahieren wollen. + +--- + +## Schritt 2: Bild für OCR laden – wie man ein Bild OCR‑t + +Jetzt übergeben wir der Engine ein echtes Bild. Der Helfer `Image.load` versteht gängige Formate (PNG, JPEG, TIFF) und abstrahiert Date‑I/O‑Eigenheiten. + +```python +# Load the input image – this is the "load image for OCR" step +input_image = aocr.Image.load("YOUR_DIRECTORY/sample_invoice.png") +raw_text = ocr_engine.recognize(input_image) # Returns the recognised text as a string +``` + +**Tipp:** Wenn Ihre Quellbilder groß sind, sollten Sie sie vor dem Übergeben an die Engine verkleinern; kleinere Abmessungen reduzieren den GPU‑Speicherverbrauch, ohne die Erkennungsqualität zu beeinträchtigen. + +--- + +## Schritt 3: Aspose‑AI‑Modell konfigurieren – Text aus Rechnung erkennen + +Aspose AI liefert ein winziges GGUF‑Modell, das Sie automatisch herunterladen können. Das Beispiel nutzt das Repository `Qwen2.5‑3B‑Instruct‑GGUF`, quantisiert zu `q4_k_m`. 
Außerdem weisen wir die Laufzeit an, 20 Schichten auf der GPU zu reservieren, was Geschwindigkeit und VRAM‑Nutzung ausbalanciert. + +```python +# Model configuration – auto‑download a small Q4‑K‑M quantised model +model_config = ai.AsposeAIModelConfig() +model_config.allow_auto_download = "true" +model_config.hugging_face_repo_id = "bartowski/Qwen2.5-3B-Instruct-GGUF" +model_config.hugging_face_quantization = "q4_k_m" +model_config.gpu_layers = 20 # Use 20 GPU layers when a GPU is available +``` + +**Im Hintergrund:** Das quantisierte Modell belegt etwa 1,5 GB auf der Festplatte – ein Bruchteil eines Voll‑Präzisions‑Modells, reicht aber aus, um typische OCR‑Rechtschreibfehler zu erkennen. + +--- + +## Schritt 4: AsposeAI initialisieren und den Rechtschreib‑Check‑Post‑Processor anhängen + +Aspose AI enthält einen fertigen Rechtschreib‑Check‑Post‑Processor. Durch das Anhängen wird jedes OCR‑Ergebnis automatisch bereinigt. + +```python +# Initialise AsposeAI and attach the built‑in spell‑check post‑processor +ocr_ai = ai.AsposeAI(model_config) # Pass the config we just built +ocr_ai.set_post_processor(ocr_ai.postprocessor_spell_check, {}) # Empty dict → default settings +``` + +**Warum den Post‑Processor nutzen?** OCR‑Engines lesen häufig „Invoice“ als „Invo1ce“ oder „Total“ als „T0tal“. Der Rechtschreib‑Check läuft ein leichtes Sprachmodell über den Roh‑String und korrigiert diese Fehler, ohne dass Sie ein eigenes Wörterbuch schreiben müssen. + +--- + +## Schritt 5: Rechtschreib‑Check‑Post‑Processor auf das OCR‑Ergebnis anwenden + +Mit allem verkabelt, liefert ein einziger Aufruf den korrigierten Text. Wir geben sowohl die Original‑ als auch die bereinigte Version aus, damit Sie die Verbesserung sehen können. 
+ +```python +# Run the spell‑check post‑processor on the OCR result +corrected_text = ocr_ai.run_postprocessor(raw_text) + +print("Original :", raw_text) +print("Corrected:", corrected_text) +``` + +Typische Ausgabe für eine Rechnung könnte so aussehen: + +``` +Original : Invo1ce #12345 +Date: 2023/07/15 +Total: $1,250.00 +... +Corrected: Invoice #12345 +Date: 2023/07/15 +Total: $1,250.00 +... +``` + +Beachten Sie, wie „Invo1ce“ in das korrekte Wort „Invoice“ umgewandelt wurde. Das ist die Kraft des integrierten KI‑Rechtschreib‑Checks. + +--- + +## Schritt 6: GPU‑Ressourcen freigeben – GPU‑Ressourcen sicher freigeben + +Wenn Sie das in einem langlebigen Service (z. B. einer Web‑API, die Dutzende Rechnungen pro Minute verarbeitet) ausführen, müssen Sie den GPU‑Kontext nach jedem Batch freigeben. Andernfalls entstehen Speicher‑Leaks und schließlich „CUDA out of memory“-Fehler. + +```python +# Release GPU resources – crucial to avoid memory leaks +ocr_ai.free_resources() +``` + +**Pro‑Tipp:** Rufen Sie `free_resources()` in einem `finally`‑Block oder einem Context‑Manager auf, sodass er immer ausgeführt wird, selbst wenn eine Ausnahme auftritt. + +--- + +## Vollständiges funktionierendes Beispiel + +Alle Bausteine zusammen ergeben ein eigenständiges Skript, das Sie in jedes Projekt einbinden können. 
+ +```python +# extract_text_from_image.py +import aocr +import aspose.ai as ai + +def main(): + # Step 1: Initialise OCR engine + ocr_engine = aocr.OcrEngine() + ocr_engine.language = aocr.Language.English + ocr_engine.recognize_mode = aocr.RecognitionMode.Plain + + # Step 2: Load image for OCR + input_image = aocr.Image.load("YOUR_DIRECTORY/sample_invoice.png") + raw_text = ocr_engine.recognize(input_image) + + # Step 3: Configure Aspose AI model + model_cfg = ai.AsposeAIModelConfig() + model_cfg.allow_auto_download = "true" + model_cfg.hugging_face_repo_id = "bartowski/Qwen2.5-3B-Instruct-GGUF" + model_cfg.hugging_face_quantization = "q4_k_m" + model_cfg.gpu_layers = 20 + + # Step 4: Initialise AI and attach spell‑check + ocr_ai = ai.AsposeAI(model_cfg) + ocr_ai.set_post_processor(ocr_ai.postprocessor_spell_check, {}) + + # Step 5: Run spell‑check + corrected_text = ocr_ai.run_postprocessor(raw_text) + + print("Original :", raw_text) + print("Corrected:", corrected_text) + + # Step 6: Release GPU resources + ocr_ai.free_resources() + +if __name__ == "__main__": + main() +``` + +Speichern Sie die Datei, passen Sie den Pfad zu Ihrem Bild an und führen Sie `python extract_text_from_image.py` aus. Sie sollten den bereinigten Rechnungstext in der Konsole sehen. + +--- + +## Häufig gestellte Fragen (FAQ) + +**F: Funktioniert das auf reinen CPU‑Maschinen?** +A: Absolut. Wird keine GPU erkannt, fällt Aspose AI auf CPU‑Ausführung zurück, ist jedoch langsamer. Sie können die CPU erzwingen, indem Sie `model_cfg.gpu_layers = 0` setzen. + +**F: Was, wenn meine Rechnungen in einer anderen Sprache als Englisch vorliegen?** +A: Ändern Sie `ocr_engine.language` auf den entsprechenden Enum‑Wert (z. B. `aocr.Language.Spanish`). Das Rechtschreib‑Check‑Modell ist mehrsprachig, aber Sie erhalten möglicherweise bessere Ergebnisse mit einem sprachspezifischen Modell. + +**F: Kann ich mehrere Bilder in einer Schleife verarbeiten?** +A: Ja. 
Platzieren Sie die Lade‑, Erkennungs‑ und Post‑Processing‑Schritte einfach in einer `for`‑Schleife. Denken Sie daran, `ocr_ai.free_resources()` nach der Schleife oder nach jedem Batch aufzurufen, wenn Sie dieselbe AI‑Instanz wiederverwenden. + +**F: Wie groß ist der Modell‑Download?** +A: Etwa 1,5 GB für die quantisierte `q4_k_m`‑Version. Nach dem ersten Lauf wird er zwischengespeichert, sodass nachfolgende Ausführungen sofort starten. + +--- + +## Fazit + +In diesem Tutorial haben wir gezeigt, wie man **Text aus Bild extrahiert** mit Aspose OCR, ein kleines KI‑Modell konfiguriert, einen Rechtschreib‑Check‑Post‑Processor anwendet und sicher **GPU‑Ressourcen freigibt**. Der Workflow deckt alles ab – vom Laden des Bildes bis zum Aufräumen – und liefert Ihnen eine zuverlässige Pipeline für **Text aus Rechnung erkennen**‑Szenarien. + +Nächste Schritte? Ersetzen Sie den Rechtschreib‑Check durch ein benutzerdefiniertes Entity‑Extraction‑Modell + +{{< /blocks/products/pf/tutorial-page-section >}} +{{< /blocks/products/pf/main-container >}} +{{< /blocks/products/pf/main-wrap-class >}} +{{< blocks/products/products-backtop-button >}} \ No newline at end of file diff --git a/ocr/german/python/general/how-to-batch-ocr-with-aspose-ocr-full-python-guide/_index.md b/ocr/german/python/general/how-to-batch-ocr-with-aspose-ocr-full-python-guide/_index.md new file mode 100644 index 000000000..8f2d729fc --- /dev/null +++ b/ocr/german/python/general/how-to-batch-ocr-with-aspose-ocr-full-python-guide/_index.md @@ -0,0 +1,216 @@ +--- +category: general +date: 2026-05-03 +description: Wie man Bilder stapelweise mit Aspose OCR und KI‑Rechtschreibprüfung + verarbeitet. Lernen Sie, Text aus Bildern zu extrahieren, Rechtschreibprüfung anzuwenden, + KI‑Ressourcen kostenlos zu nutzen und OCR‑Fehler zu korrigieren. 
+draft: false +keywords: +- how to batch ocr +- extract text from images +- free ai resources +- apply spell check +- correct ocr errors +language: de +og_description: Wie man Bilder stapelweise mit Aspose OCR und KI‑Rechtschreibprüfung + verarbeitet. Folgen Sie einer Schritt‑für‑Schritt‑Anleitung, um Text aus Bildern + zu extrahieren, die Rechtschreibprüfung anzuwenden, KI‑Ressourcen freizugeben und + OCR‑Fehler zu korrigieren. +og_title: Wie man Batch-OCR mit Aspose OCR durchführt – Komplettes Python‑Tutorial +tags: +- OCR +- Python +- AI +- Aspose +title: Wie man Batch‑OCR mit Aspose OCR durchführt – Vollständiger Python‑Leitfaden +url: /de/python/general/how-to-batch-ocr-with-aspose-ocr-full-python-guide/ +--- + +{{< blocks/products/pf/main-wrap-class >}} +{{< blocks/products/pf/main-container >}} +{{< blocks/products/pf/tutorial-page-section >}} + +# Wie man Batch-OCR mit Aspose OCR durchführt – Vollständige Python-Anleitung + +Haben Sie sich jemals gefragt, **wie man Batch-OCR** für einen ganzen Ordner gescannter PDFs oder Fotos durchführt, ohne für jede Datei ein separates Skript zu schreiben? Sie sind nicht allein. In vielen realen Pipelines müssen Sie **Text aus Bildern extrahieren**, Rechtschreibfehler bereinigen und schließlich alle zugewiesenen KI‑Ressourcen freigeben. Dieses Tutorial zeigt Ihnen genau, wie Sie das mit Aspose OCR, einem leichten KI‑Post‑Processor, und ein paar Zeilen Python erreichen. + +Wir führen Sie durch die Initialisierung der OCR‑Engine, das Anschließen eines KI‑Rechtschreibprüfers, das Durchlaufen eines Bildverzeichnisses und das Aufräumen des Modells im Anschluss. Am Ende haben Sie ein einsatzbereites Skript, das **OCR‑Fehler** automatisch **korrigiert** und **KI‑Ressourcen freigibt**, sodass Ihre GPU zufrieden bleibt. + +## Was Sie benötigen + +- Python 3.9+ (der Code verwendet Typ‑Hints, funktioniert aber auch mit früheren 3.x‑Versionen) +- `asposeocr`‑Paket (`pip install asposeocr`) – stellt die OCR‑Engine bereit. 
+- Zugriff auf das Hugging Face‑Modell `bartowski/Qwen2.5-3B-Instruct-GGUF` (wird automatisch heruntergeladen). +- Eine GPU mit mindestens einigen GB VRAM (das Skript setzt `gpu_layers = 30`, Sie können es bei Bedarf reduzieren). + +Keine externen Dienste, keine kostenpflichtigen APIs – alles läuft lokal. + +--- + +## Schritt 1: OCR‑Engine einrichten – **How to Batch OCR** effizient + +Bevor wir tausend Bilder verarbeiten können, benötigen wir eine solide OCR‑Engine. Aspose OCR ermöglicht es uns, Sprache und Erkennungsmodus in einem einzigen Aufruf auszuwählen. + +```python +# Step 1: Initialize the OCR engine for English plain‑text output +def init_ocr() -> aocr.OcrEngine: + ocr_engine = aocr.OcrEngine() + ocr_engine.language = aocr.Language.English # English language pack + ocr_engine.recognize_mode = aocr.RecognitionMode.Plain # Returns raw string, no layout + return ocr_engine +``` + +**Warum das wichtig ist:** Das Setzen von `recognize_mode` auf `Plain` hält die Ausgabe leichtgewichtig, was ideal ist, wenn Sie später eine Rechtschreibprüfung durchführen möchten. Wenn Sie Layout‑Informationen benötigen, würden Sie zu `Layout` wechseln, was jedoch zusätzlichen Overhead erzeugt, den Sie in einem Batch‑Job wahrscheinlich nicht wollen. + +> **Pro‑Tipp:** Wenn Sie mit mehrsprachigen Scans arbeiten, können Sie eine Liste übergeben, z. B. `ocr_engine.language = [aocr.Language.English, aocr.Language.Spanish]`. + +--- + +## Schritt 2: KI‑Post‑Processor initialisieren – **Apply Spell Check** auf OCR‑Ausgabe + +Aspose AI wird mit einem integrierten Post‑Processor geliefert, der jedes gewünschte Modell ausführen kann. Hier holen wir ein quantisiertes Qwen 2.5‑Modell von Hugging Face und binden die Rechtschreibprüfungs‑Routine ein. 
+ +```python +# Step 2: Configure and start the AI post‑processor +def init_ai() -> aocr.ai.AsposeAI: + model_cfg = AsposeAIModelConfig() + model_cfg.allow_auto_download = "true" + model_cfg.hugging_face_repo_id = "bartowski/Qwen2.5-3B-Instruct-GGUF" + model_cfg.hugging_face_quantization = "q4_k_m" + model_cfg.gpu_layers = 30 # Adjust based on your GPU memory + ai_processor = AsposeAI() + ai_processor.initialize(model_cfg) + + # Attach the built‑in spell‑check post‑processor + ai_processor.set_post_processor(ai_processor.postprocessor_spell_check, {}) + return ai_processor +``` + +**Warum das wichtig ist:** Das Modell ist quantisiert (`q4_k_m`), was den Speicherverbrauch stark reduziert und dennoch ein gutes Sprachverständnis liefert. Durch den Aufruf von `set_post_processor` teilen wir Aspose AI mit, den **apply spell check**‑Schritt automatisch auf jede übergebene Zeichenkette anzuwenden. + +> **Achtung:** Wenn Ihre GPU nicht 30 Schichten verarbeiten kann, reduzieren Sie die Zahl auf 15 oder sogar 5 – das Skript funktioniert weiterhin, nur etwas langsamer. + +--- + +## Schritt 3: OCR ausführen und **Correct OCR Errors** auf einem einzelnen Bild + +Jetzt, wo sowohl die OCR‑Engine als auch der KI‑Rechtschreibprüfer bereit sind, kombinieren wir sie. Diese Funktion lädt ein Bild, extrahiert Rohtext und lässt anschließend den KI‑Post‑Processor zur Bereinigung laufen. 
+ +```python +# Step 3: OCR an image and run the spell‑check post‑processor +def ocr_and_correct(image_path: str, + ocr_engine: aocr.OcrEngine, + ai_processor: aocr.ai.AsposeAI) -> str: + image = aocr.Image.load(image_path) # Load any supported format + raw_text = ocr_engine.recognize(image) # Plain string from OCR + corrected_text = ai_processor.run_postprocessor(raw_text) + return corrected_text +``` + +**Warum das wichtig ist:** Das direkte Übergeben des rohen OCR‑Strings an das KI‑Modell ermöglicht einen **correct OCR errors**‑Durchlauf, ohne reguläre Ausdrücke oder benutzerdefinierte Wörterbücher schreiben zu müssen. Das Modell kennt den Kontext und kann z. B. „recieve“ → „receive“ und noch subtilere Fehler korrigieren. + +--- + +## Schritt 4: **Extract Text from Images** in Bulk – Die eigentliche Batch‑Schleife + +Hier kommt die Magie von **how to batch OCR** zum Tragen. Wir iterieren über ein Verzeichnis, überspringen nicht unterstützte Dateien und schreiben jede korrigierte Ausgabe in eine `.txt`‑Datei. 
+ +```python +# Step 4: Process an entire folder of images +if __name__ == "__main__": + # Initialize once – reuse for every file + ocr_engine = init_ocr() + ai_processor = init_ai() + + input_dir = "YOUR_DIRECTORY/input_images" + output_dir = "YOUR_DIRECTORY/output_text" + os.makedirs(output_dir, exist_ok=True) + + for file_name in os.listdir(input_dir): + # Only handle common image extensions + if not file_name.lower().endswith(('.png', '.jpg', '.jpeg', '.tif', '.tiff')): + continue + + image_path = os.path.join(input_dir, file_name) + corrected = ocr_and_correct(image_path, ocr_engine, ai_processor) + + txt_path = os.path.join(output_dir, + os.path.splitext(file_name)[0] + ".txt") + with open(txt_path, "w", encoding="utf-8") as txt_file: + txt_file.write(corrected) + + print(f"Processed {file_name}") + + # Step 5: Release **free AI resources** after the batch finishes + ai_processor.free_resources() +``` + +### Erwartete Ausgabe + +Für ein Bild, das den Satz *„The quick brown fox jumps over the lazzy dog.“* enthält, sehen Sie eine Textdatei mit: + +``` +The quick brown fox jumps over the lazy dog. +``` + +Beachten Sie, dass das doppelte „z“ automatisch korrigiert wurde – das ist der KI‑Rechtschreibprüfer in Aktion. + +**Warum das wichtig ist:** Durch das Erstellen der OCR‑ und KI‑Objekte **einmal** und deren Wiederverwendung vermeiden wir den Overhead, das Modell für jede Datei neu zu laden. Dies ist der effizienteste Weg, **how to batch OCR** in großem Maßstab durchzuführen. + +--- + +## Schritt 5: Aufräumen – **Free AI Resources** korrekt freigeben + +Wenn Sie fertig sind, gibt der Aufruf von `free_resources()` GPU‑Speicher, CUDA‑Kontexte und alle temporären Dateien, die das Modell erstellt hat, frei. + +```python +# Step 5: Explicitly free GPU and model memory +ai_processor.free_resources() +``` + +Das Überspringen dieses Schrittes kann hängende GPU‑Zuweisungen hinterlassen, die nachfolgende Python‑Prozesse zum Absturz bringen oder VRAM verbrauchen können. 
Betrachten Sie es als den „Licht‑aus“-Teil eines Batch‑Jobs. + +--- + +## Häufige Fallstricke & zusätzliche Tipps + +| Issue | What to Look For | Fix | +|-------|------------------|-----| +| **Out‑of‑Memory‑Fehler** | GPU läuft nach einigen Dutzend Bildern leer | Reduzieren Sie `gpu_layers` oder wechseln Sie zur CPU (`model_cfg.gpu_layers = 0`). | +| **Missing language pack** | OCR liefert leere Zeichenketten | Stellen Sie sicher, dass die `asposeocr`‑Version englische Sprachdaten enthält; bei Bedarf neu installieren. | +| **Non‑image files** | Skript bricht bei einer verirrten `.pdf` ab | Die Bedingung `if not file_name.lower().endswith(...)` überspringt sie bereits. | +| **Spell‑check not applied** | Ausgabe sieht identisch mit dem rohen OCR aus | Vergewissern Sie sich, dass `ai_processor.set_post_processor` vor der Schleife aufgerufen wurde. | +| **Slow batch speed** | Dauert >5 Sekunden pro Bild | Aktivieren Sie `model_cfg.allow_auto_download = "false"` nach dem ersten Durchlauf, damit das Modell nicht jedes Mal neu heruntergeladen wird. | + +**Pro‑Tipp:** Wenn Sie **extract text from images** in einer anderen Sprache als Englisch benötigen, ändern Sie einfach `ocr_engine.language` auf das passende Enum (z. B. `aocr.Language.French`). Der gleiche KI‑Post‑Processor wird weiterhin die Rechtschreibprüfung anwenden, aber für optimale Ergebnisse sollten Sie ein sprachspezifisches Modell verwenden. + +--- + +## Zusammenfassung & nächste Schritte + +Wir haben die gesamte Pipeline für **how to batch OCR** behandelt: + +1. **Initialisieren** Sie eine Plain‑Text‑OCR‑Engine für Englisch. +2. **Konfigurieren** Sie ein KI‑Rechtschreibprüfungs‑Modell und binden Sie es als Post‑Processor. +3. **Führen** Sie OCR für jedes Bild aus und lassen Sie die KI **OCR‑Fehler** automatisch **korrigieren**. +4. **Iterieren** Sie über ein Verzeichnis, um **extract text from images** in großen Mengen zu **extrahieren**. +5. **Free AI resources** freigeben, sobald der Job abgeschlossen ist. 
+ +Ab hier könnten Sie: + +- Den korrigierten Text in eine nachgelagerte NLP‑Pipeline einspeisen (Sentiment‑Analyse, Entitätsextraktion usw.). +- Den Rechtschreib‑Post‑Processor durch einen benutzerdefinierten Zusammenfasser ersetzen, indem Sie `ai_processor.set_post_processor(your_custom_func, {})` aufrufen. +- Die Ordnerschleife mit `concurrent.futures.ThreadPoolExecutor` parallelisieren, falls Ihre GPU mehrere Streams verarbeiten kann. + +--- + +## Abschließende Gedanken + +Batch‑OCR muss kein Aufwand sein. Durch die Kombination von Aspose OCR mit einem leichten KI‑Modell erhalten Sie eine **All‑in‑One‑Lösung**, die **Text aus Bildern extrahiert**, **Rechtschreibprüfung anwendet**, **OCR‑Fehler korrigiert** und **KI‑Ressourcen** sauber freigibt. Testen Sie das Skript in einem Testordner, passen Sie die GPU‑Schicht‑Anzahl an Ihre Hardware an, und Sie haben in wenigen Minuten eine produktionsreife Pipeline. + +Haben Sie Fragen zum Anpassen des Modells, zum Umgang mit PDFs oder zur Integration in einen Web‑Service? Hinterlassen Sie unten einen Kommentar oder kontaktieren Sie mich auf GitHub. Viel Spaß beim Coden, und möge Ihre OCR stets genau sein! + +{{< /blocks/products/pf/tutorial-page-section >}} +{{< /blocks/products/pf/main-container >}} +{{< /blocks/products/pf/main-wrap-class >}} +{{< blocks/products/products-backtop-button >}} \ No newline at end of file diff --git a/ocr/german/python/general/python-ocr-tutorial-batch-ocr-processing-made-easy/_index.md b/ocr/german/python/general/python-ocr-tutorial-batch-ocr-processing-made-easy/_index.md new file mode 100644 index 000000000..d043bd800 --- /dev/null +++ b/ocr/german/python/general/python-ocr-tutorial-batch-ocr-processing-made-easy/_index.md @@ -0,0 +1,299 @@ +--- +category: general +date: 2026-05-03 +description: Python-OCR‑Tutorial, das zeigt, wie man PNG‑Bilddateien lädt, Text aus + Bildern erkennt und kostenlose KI‑Ressourcen für die Batch‑OCR‑Verarbeitung nutzt. 
+draft: false +keywords: +- python ocr tutorial +- batch ocr processing +- free ai resources +- load png image +- recognize text from image +language: de +og_description: Das Python-OCR-Tutorial führt Sie durch das Laden von PNG-Bildern, + das Erkennen von Text aus Bildern und die Nutzung kostenloser KI-Ressourcen für + die Batch-OCR-Verarbeitung. +og_title: Python-OCR-Tutorial – Schnelle Batch-OCR mit kostenlosen KI-Ressourcen +tags: +- OCR +- Python +- AI +title: Python-OCR-Tutorial – Stapel‑OCR‑Verarbeitung leicht gemacht +url: /de/python/general/python-ocr-tutorial-batch-ocr-processing-made-easy/ +--- + +{{< blocks/products/pf/main-wrap-class >}} +{{< blocks/products/pf/main-container >}} +{{< blocks/products/pf/tutorial-page-section >}} + +# Python OCR Tutorial – Batch OCR Processing Made Easy + +Haben Sie schon einmal ein **python ocr tutorial** gesucht, das wirklich ermöglicht, OCR auf Dutzenden von PNG‑Dateien auszuführen, ohne sich die Haare zu raufen? Sie sind nicht allein. In vielen realen Projekten muss man **load png image**‑Dateien laden, sie an eine Engine übergeben und anschließend die KI‑Ressourcen freigeben, wenn man fertig ist. + +In diesem Leitfaden gehen wir Schritt für Schritt durch ein komplettes, sofort ausführbares Beispiel, das genau zeigt, wie man **recognize text from image**‑Dateien erkennt, sie stapelweise verarbeitet und den zugrunde liegenden KI‑Speicher freigibt. Am Ende haben Sie ein eigenständiges Skript, das Sie in jedes Projekt einbinden können – ohne zusätzlichen Schnickschnack, nur das Wesentliche. + +## Was Sie benötigen + +- Python 3.10 oder neuer (die hier verwendete Syntax nutzt f‑Strings und Typ‑Hints) +- Eine OCR‑Bibliothek, die eine `engine.recognize`‑Methode bereitstellt – für die Demo gehen wir von einem fiktiven `aocr`‑Paket aus, Sie können aber Tesseract, EasyOCR usw. 
einsetzen +- Das im Code‑Snippet gezeigte `ai`‑Hilfsmodul (es übernimmt die Modellinitialisierung und das Aufräumen der Ressourcen) +- Einen Ordner voller PNG‑Dateien, die Sie verarbeiten möchten + +Falls Sie `aocr` oder `ai` nicht installiert haben, können Sie sie mit Stubs nachahmen – siehe den Abschnitt „Optionale Stubs“ am Ende. + +## Schritt 1: Initialisieren der AI Engine (Kostenlose AI Ressourcen) + +Bevor Sie ein Bild in die OCR‑Pipeline einspeisen, muss das zugrunde liegende Modell bereit sein. Einmalige Initialisierung spart Speicher und beschleunigt Batch‑Jobs. + +```python +# step_1_initialize.py +import ai # hypothetical helper that wraps the AI model +import aocr # OCR library + +def init_engine(config_path: str = "config.yaml"): + """ + Initialize the AI engine if it hasn't been set up yet. + This uses free AI resources – the engine will be released later. + """ + if not ai.is_initialized(): + ai.initialize(config_path) # auto‑initialize with the provided configuration + else: + print("Engine already initialized.") +``` + +**Warum das wichtig ist:** +Ein wiederholtes Aufrufen von `ai.initialize` für jedes Bild würde immer wieder GPU‑Speicher allokieren und schließlich das Skript zum Absturz bringen. Durch die Prüfung von `ai.is_initialized()` garantieren wir eine einzige Allokation – das ist das Prinzip „kostenlose AI‑Ressourcen“. + +## Schritt 2: PNG Image Files für Batch OCR Processing laden + +Jetzt sammeln wir alle PNG‑Dateien, die wir durch OCR laufen lassen wollen. Mit `pathlib` bleibt der Code OS‑unabhängig. + +```python +# step_2_load_images.py +from pathlib import Path +from typing import List + +def collect_png_paths(directory: str) -> List[Path]: + """ + Scan `directory` and return a list of Path objects pointing to PNG files. 
+ """ + base_path = Path(directory) + if not base_path.is_dir(): + raise NotADirectoryError(f"'{directory}' is not a valid folder.") + + png_files = sorted(base_path.glob("*.png")) + if not png_files: + raise FileNotFoundError("No PNG images found in the specified directory.") + + print(f"Found {len(png_files)} PNG image(s) to process.") + return png_files +``` + +**Randfall:** +Falls der Ordner Nicht‑PNG‑Dateien (z. B. JPEGs) enthält, werden diese ignoriert, sodass `engine.recognize` nicht an einem nicht unterstützten Format scheitert. + +## Schritt 3: OCR für jedes Bild ausführen und Post‑Processing anwenden + +Mit der bereitstehenden Engine und der vorbereiteten Dateiliste können wir über die Bilder iterieren, Rohtext extrahieren und ihn einem Nachbearbeiter übergeben, der gängige OCR‑Artefakte (wie überflüssige Zeilenumbrüche) bereinigt. + +```python +# step_3_ocr_batch.py +import aocr +import ai +from pathlib import Path +from typing import List + +def ocr_batch(image_paths: List[Path]) -> List[str]: + """ + Perform OCR on each PNG image and return a list of cleaned strings. + """ + results = [] + for image_path in image_paths: + # Load the image – aocr.Image.load abstracts away Pillow/OpenCV details + img = aocr.Image.load(str(image_path)) + + # Recognize raw text + raw_text = engine.recognize(img) + + # Refine the raw OCR output using the AI post‑processor + cleaned_text = ai.run_postprocessor(raw_text) + results.append(cleaned_text) + + print(f"Processed {image_path.name}: {len(cleaned_text)} characters extracted.") + + return results +``` + +**Warum wir Laden und Erkennen trennen:** +`aocr.Image.load` kann eine lazy Decodierung durchführen, was bei großen Stapeln schneller ist. Der explizite Ladeschritt erleichtert zudem den Austausch gegen eine andere Bildbibliothek, falls Sie später JPEG‑ oder TIFF‑Dateien verarbeiten müssen. 
+
+## Schritt 4: Aufräumen – Kostenlose AI Ressourcen nach dem Batch freigeben
+
+Nachdem der Batch abgeschlossen ist, müssen wir das Modell freigeben, um Speicherlecks zu vermeiden, besonders auf GPU‑fähigen Maschinen.
+
+```python
+# step_4_cleanup.py
+import ai
+
+def release_resources():
+    """
+    Free any allocated AI resources. Safe to call multiple times.
+    """
+    if ai.is_initialized():
+        ai.free_resources()
+        print("AI resources have been released.")
+    else:
+        print("No AI resources were allocated.")
+```
+
+## Alles zusammenführen – Das komplette Skript
+
+Unten finden Sie eine einzelne Datei, die die vier Schritte zu einem zusammenhängenden Workflow verbindet. Speichern Sie sie als `batch_ocr.py` und führen Sie sie über die Kommandozeile aus.
+
+```python
+# batch_ocr.py
+"""
+Python OCR tutorial – end‑to‑end batch OCR processing.
+Loads PNG images, runs OCR, post‑processes results, and frees AI resources.
+"""
+
+import sys
+from pathlib import Path
+import ai
+import aocr
+
+# ----------------------------------------------------------------------
+# Helper functions (copied from the steps above)
+# ----------------------------------------------------------------------
+def init_engine(cfg: str = "config.yaml"):
+    if not ai.is_initialized():
+        ai.initialize(cfg)
+    else:
+        print("Engine already initialized.")
+
+def collect_png_paths(directory: str):
+    base_path = Path(directory)
+    if not base_path.is_dir():
+        raise NotADirectoryError(f"'{directory}' is not a valid folder.")
+    png_files = sorted(base_path.glob("*.png"))
+    if not png_files:
+        raise FileNotFoundError("No PNG images found in the specified directory.")
+    print(f"Found {len(png_files)} PNG image(s) to process.")
+    return png_files
+
+def ocr_batch(image_paths):
+    results = []
+    for image_path in image_paths:
+        img = aocr.Image.load(str(image_path))
+        raw_text = aocr.engine.recognize(img)
+        cleaned_text = ai.run_postprocessor(raw_text)
+        results.append(cleaned_text)
+        print(f"Processed {image_path.name}: {len(cleaned_text)} characters.")
+    return results
+
+def release_resources():
+    if ai.is_initialized():
+        ai.free_resources()
+        print("AI resources released.")
+    else:
+        print("No resources to release.")
+
+# ----------------------------------------------------------------------
+# Main execution block
+# ----------------------------------------------------------------------
+if __name__ == "__main__":
+    if len(sys.argv) != 2:
+        print("Usage: python batch_ocr.py <image_directory>")
+        sys.exit(1)
+
+    image_dir = sys.argv[1]
+
+    try:
+        init_engine()
+        png_paths = collect_png_paths(image_dir)
+        texts = ocr_batch(png_paths)
+
+        # Optional: write results to a single text file
+        output_file = Path("ocr_results.txt")
+        with output_file.open("w", encoding="utf-8") as f:
+            for path, txt in zip(png_paths, texts):
+                f.write(f"--- {path.name} ---\n")
+                f.write(txt + "\n\n")
+        print(f"All results saved to {output_file.resolve()}")
+    finally:
+        release_resources()
+```
+
+### Erwartete Ausgabe
+
+Ein Aufruf des Skripts in einem Ordner mit drei PNG‑Dateien könnte etwa Folgendes ausgeben:
+
+```
+Initializing AI engine with config.yaml
+Found 3 PNG image(s) to process.
+Processed invoice1.png: 452 characters.
+Processed receipt2.png: 317 characters.
+Processed flyer3.png: 689 characters.
+All results saved to /home/user/ocr_results.txt
+AI resources released.
+```
+
+Die Datei `ocr_results.txt` enthält einen klaren Trenner für jedes Bild, gefolgt vom bereinigten OCR‑Text.
+
+## Optionale Stubs für aocr & ai (Falls Sie keine echten Pakete haben)
+
+Wenn Sie den Ablauf nur testen wollen, ohne schwere OCR‑Bibliotheken zu installieren, können Sie minimale Mock‑Module erstellen:
+
+```python
+# aocr/__init__.py
+class Image:
+    @staticmethod
+    def load(path):
+        return f"ImageObject({path})"
+
+def dummy_recognize(image):
+    return "Raw OCR output for " + str(image)
+
+# staticmethod, damit engine.recognize(img) kein zusätzliches "self" übergibt
+engine = type("Engine", (), {"recognize": staticmethod(dummy_recognize)})()
+```
+
+```python
+# ai/__init__.py
+_state = {"initialized": False}
+
+def is_initialized():
+    return _state["initialized"]
+
+def initialize(cfg):
+    print(f"Initializing AI engine with {cfg}")
+    _state["initialized"] = True
+
+def run_postprocessor(text):
+    # Very naive cleanup: strip extra spaces
+    return " ".join(text.split())
+
+def free_resources():
+    print("Freeing AI resources")
+    _state["initialized"] = False
+```
+
+Legen Sie diese Ordner neben `batch_ocr.py` ab und das Skript läuft und gibt Mock‑Ergebnisse aus.
+
+## Pro‑Tipps & häufige Fallstricke
+
+- **Speicherspitzen:** Wenn Sie Tausende hochauflösende PNGs verarbeiten, sollten Sie sie vor dem OCR verkleinern. `aocr.Image.load` akzeptiert häufig ein `max_size`‑Argument.
+- **Unicode‑Handling:** Öffnen Sie die Ausgabedatei immer mit `encoding="utf-8"`; OCR‑Engines können Nicht‑ASCII‑Zeichen erzeugen.
+- **Parallelität:** Für CPU‑gebundene OCR können Sie `ocr_batch` in einen `concurrent.futures.ThreadPoolExecutor` einbinden. Denken Sie jedoch daran, nur eine einzige `ai`‑Instanz zu verwenden – das Starten vieler Threads, die jeweils `ai.initialize` aufrufen, untergräbt das Ziel „kostenlose AI‑Ressourcen“.
+- **Fehlertoleranz:** Umwickeln Sie die Schleife pro Bild mit einem `try/except`‑Block, damit ein einzelnes beschädigtes PNG nicht den gesamten Batch abbricht.
+ +## Fazit + +Sie haben nun ein **python ocr tutorial**, das zeigt, wie man **load png image**‑Dateien verarbeitet, **batch OCR processing** durchführt und verantwortungsbewusst **free AI resources** verwaltet. Das vollständige, ausführbare Beispiel demonstriert exakt, wie man **recognize text from image**‑Objekte erkennt und anschließend aufräumt, sodass Sie es einfach in Ihre eigenen Projekte kopieren können, ohne nach fehlenden Bausteinen zu suchen. + +Bereit für den nächsten Schritt? Ersetzen Sie die stub‑basierten `aocr`‑ und `ai`‑Module durch echte Bibliotheken wie `pytesseract` und `torchvision`. Sie können das Skript zudem erweitern, um JSON auszugeben, Ergebnisse in eine Datenbank zu schreiben oder mit einem Cloud‑Speicher‑Bucket zu integrieren. Der Himmel ist die Grenze – happy coding! + +{{< /blocks/products/pf/tutorial-page-section >}} +{{< /blocks/products/pf/main-container >}} +{{< /blocks/products/pf/main-wrap-class >}} +{{< blocks/products/products-backtop-button >}} \ No newline at end of file diff --git a/ocr/german/python/general/run-ocr-on-image-complete-guide-to-structured-text-extractio/_index.md b/ocr/german/python/general/run-ocr-on-image-complete-guide-to-structured-text-extractio/_index.md new file mode 100644 index 000000000..fb49873c8 --- /dev/null +++ b/ocr/german/python/general/run-ocr-on-image-complete-guide-to-structured-text-extractio/_index.md @@ -0,0 +1,237 @@ +--- +category: general +date: 2026-05-03 +description: Erfahren Sie, wie Sie OCR auf einem Bild ausführen und Text mit Koordinaten + mithilfe strukturierter OCR‑Erkennung extrahieren. Schritt‑für‑Schritt‑Python‑Code + inklusive. +draft: false +keywords: +- run OCR on image +- extract text with coordinates +- structured OCR recognition +- OCR post‑processing +- bounding box extraction +- image text detection +language: de +og_description: Führen Sie OCR auf einem Bild aus und erhalten Sie Text mit Koordinaten + mittels strukturierter OCR-Erkennung. 
Vollständiges Python‑Beispiel mit Erklärungen. +og_title: OCR auf Bild ausführen – Tutorial zur strukturierten Textextraktion +tags: +- OCR +- Python +- Computer Vision +title: OCR auf Bild ausführen – Vollständiger Leitfaden zur strukturierten Textextraktion +url: /de/python/general/run-ocr-on-image-complete-guide-to-structured-text-extractio/ +--- + +{{< blocks/products/pf/main-wrap-class >}} +{{< blocks/products/pf/main-container >}} +{{< blocks/products/pf/tutorial-page-section >}} + +# OCR auf Bild ausführen – Komplettanleitung zur strukturierten Textextraktion + +Haben Sie jemals **run OCR on image**-Dateien ausführen müssen, waren sich aber nicht sicher, wie Sie die genauen Positionen jedes Wortes beibehalten? Sie sind nicht allein. In vielen Projekten – Belegscan, Formulardigitalisierung oder UI-Tests – benötigen Sie nicht nur den Rohtext, sondern auch die Begrenzungsrahmen, die Ihnen zeigen, wo jede Zeile im Bild liegt. + +Dieses Tutorial zeigt Ihnen eine praktische Methode, um *run OCR on image* mit der **aocr**‑Engine zu verwenden, **structured OCR recognition** anzufordern und anschließend das Ergebnis zu post‑processen, wobei die Geometrie erhalten bleibt. Am Ende können Sie **extract text with coordinates** in nur wenigen Zeilen Python durchführen und verstehen, warum der strukturierte Modus für nachgelagerte Aufgaben wichtig ist. + +## Was Sie lernen werden + +- Wie man die OCR-Engine für **structured OCR recognition** initialisiert. +- Wie man ein Bild einspeist und Rohresultate erhält, die Zeilenbegrenzungen enthalten. +- Wie man einen Post‑Processor ausführt, der den Text bereinigt, ohne die Geometrie zu verlieren. +- Wie man über die finalen Zeilen iteriert und jedes Textelement zusammen mit seinem Begrenzungsrahmen ausgibt. + +Kein Zauber, keine versteckten Schritte – nur ein vollständiges, ausführbares Beispiel, das Sie in Ihr eigenes Projekt einbinden können. 
+ +--- + +## Voraussetzungen + +Bevor wir loslegen, stellen Sie sicher, dass Sie Folgendes installiert haben: + +```bash +pip install aocr ai # hypothetical packages; replace with real ones if needed +``` + +Sie benötigen außerdem eine Bilddatei (`input_image.png` oder `.jpg`), die klaren, lesbaren Text enthält. Alles von einer gescannten Rechnung bis zu einem Screenshot funktioniert, solange die OCR-Engine die Zeichen erkennen kann. + +## Schritt 1: Initialisieren der OCR-Engine für strukturierte Erkennung + +Das Erste, was wir tun, ist eine Instanz von `aocr.Engine()` zu erstellen und ihr mitzuteilen, dass wir **structured OCR recognition** wollen. Der strukturierte Modus liefert nicht nur den Klartext, sondern auch geometrische Daten (begrenzende Rechtecke) für jede Zeile, was unerlässlich ist, wenn Sie Text zurück auf das Bild abbilden müssen. + +```python +import aocr +import ai # hypothetical post‑processing module + +# Initialise the OCR engine +ocr_engine = aocr.Engine() + +# Request structured recognition (text + geometry) +ocr_engine.recognize_mode = aocr.RecognitionMode.Structured +``` + +> **Warum das wichtig ist:** +> Im Standardmodus gibt die Engine möglicherweise nur einen String aus zusammengefügten Wörtern zurück. Der strukturierte Modus liefert Ihnen eine Hierarchie von Seiten → Zeilen → Wörtern, jeweils mit Koordinaten, was das Überlagern der Ergebnisse auf dem Originalbild oder das Einspeisen in ein layout‑bewusstes Modell erheblich erleichtert. + +## Schritt 2: OCR auf dem Bild ausführen und Rohresultate erhalten + +Jetzt geben wir das Bild an die Engine weiter. Der Aufruf `recognize` gibt ein `OcrResult`‑Objekt zurück, das eine Sammlung von Zeilen enthält, von denen jede ihr eigenes Begrenzungsrechteck hat. 
+ +```python +# Load your image (any format supported by aocr) +input_image_path = "input_image.png" + +# Run OCR – this returns an OcrResult with lines and bounds +raw_result = ocr_engine.recognize(input_image_path) +``` + +An diesem Punkt enthält `raw_result.lines` Objekte mit zwei wichtigen Attributen: + +- `text` – die erkannte Zeichenkette für diese Zeile. +- `bounds` – ein Tupel wie `(x, y, width, height)`, das die Position der Zeile beschreibt. + +## Schritt 3: Nachbearbeitung bei Erhaltung der Geometrie + +Roh‑OCR‑Ausgaben sind oft verrauscht: fremde Zeichen, falsche Leerzeichen oder Zeilenumbruchprobleme. Die Funktion `ai.run_postprocessor` bereinigt den Text, **keeps the original geometry** unverändert, sodass Sie weiterhin genaue Koordinaten haben. + +```python +# Apply a post‑processing step that corrects common OCR errors +postprocessed_result = ai.run_postprocessor(raw_result) + +# The structure (lines + bounds) stays the same, only `line.text` changes +``` + +> **Pro Tipp:** Wenn Sie domänenspezifische Vokabulare haben (z. B. Produktcodes), übergeben Sie dem Post‑Processor ein benutzerdefiniertes Wörterbuch, um die Genauigkeit zu verbessern. + +## Schritt 4: Text mit Koordinaten extrahieren – iterieren und anzeigen + +Abschließend iterieren wir über die bereinigten Zeilen und geben für jede Zeile den Begrenzungsrahmen zusammen mit dem Text aus. Das ist das Kernstück von **extract text with coordinates**. 
+ +```python +# Print each recognised line together with its bounding box +for line in postprocessed_result.lines: + print(f"[{line.bounds}] {line.text}") +``` + +### Erwartete Ausgabe + +Angenommen, das Eingabebild enthält zwei Zeilen: „Invoice #12345“ und „Total: $89.99“, Sie sehen etwa Folgendes: + +``` +[(15, 30, 210, 25)] Invoice #12345 +[(15, 70, 190, 25)] Total: $89.99 +``` + +Das erste Tupel ist das `(x, y, width, height)` der Zeile im Originalbild, sodass Sie Rechtecke zeichnen, Text hervorheben oder die Koordinaten in ein anderes System einspeisen können. + +## Visualisierung des Ergebnisses (Optional) + +Wenn Sie die Begrenzungsrahmen über das Bild gelegt sehen möchten, können Sie Pillow (PIL) verwenden, um Rechtecke zu zeichnen. Unten ist ein kurzer Ausschnitt; Sie können ihn überspringen, wenn Sie nur die Rohdaten benötigen. + +```python +from PIL import Image, ImageDraw + +# Open the original image +img = Image.open(input_image_path) +draw = ImageDraw.Draw(img) + +# Draw a rectangle around each line +for line in postprocessed_result.lines: + x, y, w, h = line.bounds + draw.rectangle([x, y, x + w, y + h], outline="red", width=2) + +# Save or show the annotated image +img.save("annotated_output.png") +img.show() +``` + +![run OCR on image example showing bounding boxes](/images/ocr-bounding-boxes.png "run OCR on image – bounding box overlay") + +Der obige Alt‑Text enthält das **primary keyword**, was die SEO‑Anforderung für Bild‑Alt‑Attribute erfüllt. + +## Warum strukturierte OCR-Erkennung die einfache Textextraktion übertrifft + +Sie fragen sich vielleicht: „Kann ich nicht einfach OCR ausführen und den Text erhalten? Warum die Geometrie?“. + +- **Spatial context:** Wenn Sie Felder auf einem Formular zuordnen müssen (z. B. „Date“ neben einem Datumswert), geben Ihnen die Koordinaten an, *wo* die Daten liegen. +- **Multi‑column layouts:** Einfacher linearer Text verliert die Reihenfolge; strukturierte Daten erhalten die Spaltenreihenfolge. 
+- **Post‑processing accuracy:** Das Wissen um die Boxgröße hilft Ihnen zu entscheiden, ob ein Wort eine Überschrift, eine Fußnote oder ein fremdes Artefakt ist. + +Kurz gesagt, bietet **structured OCR recognition** Ihnen die Flexibilität, intelligentere Pipelines zu bauen – egal, ob Sie Daten in eine Datenbank einspeisen, durchsuchbare PDFs erstellen oder ein Machine‑Learning‑Modell trainieren, das das Layout respektiert. + +## Häufige Randfälle und deren Handhabung + +| Situation | Worauf zu achten ist | Vorgeschlagene Lösung | +|-----------|----------------------|-----------------------| +| **Gedrehte oder verzerrte Bilder** | Begrenzungsrahmen können von der Achse abweichen. | Vorverarbeiten mit Entzerrung (z. B. OpenCV’s `warpAffine`). | +| **Sehr kleine Schriften** | Die Engine kann Zeichen übersehen, was zu leeren Zeilen führt. | Erhöhen Sie die Bildauflösung oder verwenden Sie `ocr_engine.set_dpi(300)`. | +| **Gemischte Sprachen** | Ein falsches Sprachmodell kann unlesbaren Text erzeugen. | Setzen Sie `ocr_engine.language = ["en", "de"]` vor der Erkennung. | +| **Überlappende Boxen** | Der Post‑Processor könnte unbeabsichtigt zwei Zeilen zusammenführen. | Überprüfen Sie `line.bounds` nach der Verarbeitung; passen Sie die Schwellenwerte in `ai.run_postprocessor` an. | + +Das frühzeitige Angehen dieser Szenarien erspart Ihnen später Kopfschmerzen, besonders wenn Sie die Lösung auf Hunderte von Dokumenten pro Tag skalieren. + +## Vollständiges End‑to‑End‑Skript + +Unten finden Sie das komplette, sofort ausführbare Programm, das alle Schritte zusammenführt. Kopieren‑einfügen, passen Sie den Bildpfad an, und Sie können loslegen. + +```python +# -*- coding: utf-8 -*- +""" +Run OCR on image – extract text with coordinates using structured OCR recognition. 
+Author: Your Name +Date: 2026-05-03 +""" + +import aocr +import ai +from PIL import Image, ImageDraw + +def run_structured_ocr(image_path: str, annotate: bool = False): + # 1️⃣ Initialise the OCR engine + ocr_engine = aocr.Engine() + ocr_engine.recognize_mode = aocr.RecognitionMode.Structured + + # 2️⃣ Recognise the image + raw_result = ocr_engine.recognize(image_path) + + # 3️⃣ Post‑process while keeping geometry + processed = ai.run_postprocessor(raw_result) + + # 4️⃣ Print each line with its bounding box + for line in processed.lines: + print(f"[{line.bounds}] {line.text}") + + # Optional visualisation + if annotate: + img = Image.open(image_path) + draw = ImageDraw.Draw(img) + for line in processed.lines: + x, y, w, h = line.bounds + draw.rectangle([x, y, x + w, y + h], outline="red", width=2) + annotated_path = "annotated_" + image_path + img.save(annotated_path) + print(f"Annotated image saved as {annotated_path}") + +if __name__ == "__main__": + INPUT_IMG = "input_image.png" + run_structured_ocr(INPUT_IMG, annotate=True) +``` + +Das Ausführen dieses Skripts bewirkt: + +1. **Run OCR on image** mit strukturiertem Modus. +2. **Extract text with coordinates** für jede Zeile. +3. Optional wird ein annotiertes PNG erzeugt, das die Boxen zeigt. + +## Fazit + +Sie haben nun eine solide, eigenständige Lösung, um **run OCR on image** und **extract text with coordinates** mit **structured OCR recognition** durchzuführen. Der Code demonstriert jeden Schritt – von der Engine‑Initialisierung über die Nachbearbeitung bis hin zur visuellen Verifizierung – sodass Sie ihn an Quittungen, Formulare oder jedes visuelle Dokument anpassen können, das eine präzise Textlokalisierung benötigt. + +Was kommt als Nächstes? Versuchen Sie, die `aocr`‑Engine durch eine andere Bibliothek (Tesseract, EasyOCR) zu ersetzen und sehen Sie, wie sich deren strukturierte Ausgaben unterscheiden. 
Experimentieren Sie mit verschiedenen Nachbearbeitungsstrategien, wie Rechtschreibprüfung oder benutzerdefinierten Regex‑Filtern, um die Genauigkeit für Ihr Fachgebiet zu steigern. Und wenn Sie eine größere Pipeline bauen, überlegen Sie, die `(text, bounds)`‑Paare in einer Datenbank für spätere Analysen zu speichern. + +Viel Spaß beim Programmieren und mögen Ihre OCR‑Projekte stets genau sein! + +{{< /blocks/products/pf/tutorial-page-section >}} +{{< /blocks/products/pf/main-container >}} +{{< /blocks/products/pf/main-wrap-class >}} +{{< blocks/products/products-backtop-button >}} \ No newline at end of file diff --git a/ocr/greek/python/general/extract-text-from-image-ocr-with-aspose-ai-spell-check/_index.md b/ocr/greek/python/general/extract-text-from-image-ocr-with-aspose-ai-spell-check/_index.md new file mode 100644 index 000000000..29f5909b9 --- /dev/null +++ b/ocr/greek/python/general/extract-text-from-image-ocr-with-aspose-ai-spell-check/_index.md @@ -0,0 +1,230 @@ +--- +category: general +date: 2026-05-03 +description: Εξαγωγή κειμένου από εικόνα χρησιμοποιώντας το Aspose OCR και AI ορθογραφικό + έλεγχο. Μάθετε πώς να κάνετε OCR σε εικόνα, να φορτώνετε εικόνα για OCR, να αναγνωρίζετε + κείμενο από τιμολόγιο και να απελευθερώνετε τους πόρους GPU. +draft: false +keywords: +- extract text from image +- how to ocr image +- load image for ocr +- release gpu resources +- recognize text from invoice +language: el +og_description: Εξαγωγή κειμένου από εικόνα με Aspose OCR και AI ορθογραφικό έλεγχο. + Οδηγός βήμα‑προς‑βήμα που καλύπτει πώς να κάνετε OCR σε εικόνα, να φορτώσετε την + εικόνα για OCR και να απελευθερώσετε τους πόρους GPU. 
+og_title: Εξαγωγή κειμένου από εικόνα – Πλήρης οδηγός OCR & ελέγχου ορθογραφίας +tags: +- OCR +- Aspose +- AI +- Python +title: Εξαγωγή κειμένου από εικόνα – OCR με το Aspose AI Spell‑Check +url: /el/python/general/extract-text-from-image-ocr-with-aspose-ai-spell-check/ +--- + +{{< blocks/products/pf/main-wrap-class >}} +{{< blocks/products/pf/main-container >}} +{{< blocks/products/pf/tutorial-page-section >}} + +# εξαγωγή κειμένου από εικόνα – Οδηγός πλήρους OCR & Spell‑Check + +Κάποτε χρειάστηκε να **εξάγετε κείμενο από εικόνα** αλλά δεν ήσασταν σίγουροι ποια βιβλιοθήκη θα σας προσφέρει ταχύτητα και ακρίβεια; Δεν είστε οι μόνοι. Σε πολλά πραγματικά έργα—όπως επεξεργασία τιμολογίων, ψηφιοποίηση αποδείξεων ή σάρωση συμβάσεων—η λήψη καθαρού, αναζητήσιμου κειμένου από μια φωτογραφία είναι το πρώτο εμπόδιο. + +Το καλό νέο είναι ότι το Aspose OCR σε συνδυασμό με ένα ελαφρύ μοντέλο Aspose AI μπορεί να το κάνει αυτό με λίγες γραμμές Python. Σε αυτό το tutorial θα δούμε **πώς να κάνετε OCR σε εικόνα**, πώς να φορτώσετε σωστά την εικόνα, πώς να τρέξετε έναν ενσωματωμένο ελεγκτή ορθογραφίας και, τέλος, **πώς να απελευθερώσετε τους πόρους GPU** ώστε η εφαρμογή σας να παραμένει φιλική στη μνήμη. + +Στο τέλος αυτού του οδηγού θα μπορείτε να **αναγνωρίσετε κείμενο από εικόνες τιμολογίων**, να διορθώνετε αυτόματα κοινά λάθη OCR και να διατηρείτε το GPU σας καθαρό για την επόμενη παρτίδα. + +--- + +## Τι Θα Χρειαστείτε + +- Python 3.9 ή νεότερο (ο κώδικας χρησιμοποιεί type hints αλλά λειτουργεί και σε παλαιότερες εκδόσεις 3.x) +- Πακέτα `aspose-ocr` και `aspose-ai` (εγκατάσταση μέσω `pip install aspose-ocr aspose-ai`) +- Ένα GPU με υποστήριξη CUDA είναι προαιρετικό· το script θα επανέλθει σε CPU αν δεν βρεθεί GPU. +- Μια εικόνα παραδείγματος, π.χ. `sample_invoice.png`, τοποθετημένη σε φάκελο που μπορείτε να αναφέρετε. + +Χωρίς βαριές ML βιβλιοθήκες, χωρίς τεράστιες λήψεις μοντέλων—μόνο ένα μικρό μοντέλο Q4‑K‑M quantised που χωράει άνετα στα περισσότερα GPU. 
+ +--- + +## Βήμα 1: Αρχικοποίηση του OCR Engine – εξαγωγή κειμένου από εικόνα + +Το πρώτο που κάνετε είναι να δημιουργήσετε μια παρουσία `OcrEngine` και να ορίσετε τη γλώσσα που αναμένετε. Εδώ επιλέγουμε τα Αγγλικά και ζητάμε έξοδο plain‑text, η οποία είναι ιδανική για επεξεργασία. + +```python +import aocr # Aspose OCR package +import aspose.ai as ai # Aspose AI package + +# Initialise the OCR engine +ocr_engine = aocr.OcrEngine() +ocr_engine.language = aocr.Language.English # Choose any supported language +ocr_engine.recognize_mode = aocr.RecognitionMode.Plain # Plain text makes post‑processing easier +``` + +**Γιατί είναι σημαντικό:** Η ρύθμιση της γλώσσας περιορίζει το σύνολο χαρακτήρων, βελτιώνοντας την ακρίβεια. Η λειτουργία plain‑text αφαιρεί πληροφορίες διάταξης που συνήθως δεν χρειάζεστε όταν θέλετε απλώς να εξάγετε κείμενο από εικόνα. + +--- + +## Βήμα 2: Φόρτωση εικόνας για OCR – πώς να κάνετε OCR σε εικόνα + +Τώρα τροφοδοτούμε τον κινητήρα με μια πραγματική εικόνα. Η βοηθητική μέθοδος `Image.load` καταλαβαίνει κοινές μορφές (PNG, JPEG, TIFF) και αφαιρεί τις ιδιαιτερότητες του file‑IO. + +```python +# Load the input image – this is the "load image for OCR" step +input_image = aocr.Image.load("YOUR_DIRECTORY/sample_invoice.png") +raw_text = ocr_engine.recognize(input_image) # Returns the recognised text as a string +``` + +**Συμβουλή:** Αν οι εικόνες προέλευσης είναι μεγάλες, σκεφτείτε να τις αλλάξετε σε μικρότερο μέγεθος πριν τις στείλετε στον κινητήρα· μικρότερες διαστάσεις μπορούν να μειώσουν τη χρήση μνήμης GPU χωρίς να επηρεάσουν την ποιότητα αναγνώρισης. + +--- + +## Βήμα 3: Διαμόρφωση του Aspose AI Model – αναγνώριση κειμένου από τιμολόγιο + +Το Aspose AI έρχεται με ένα μικρό μοντέλο GGUF που μπορείτε να κατεβάσετε αυτόματα. Το παράδειγμα χρησιμοποιεί το αποθετήριο `Qwen2.5‑3B‑Instruct‑GGUF`, quantised σε `q4_k_m`. Επίσης, ορίζουμε στο runtime να εκχωρήσει 20 στρώματα στο GPU, κάτι που εξισορροπεί την ταχύτητα και τη χρήση VRAM. 
+ +```python +# Model configuration – auto‑download a small Q4‑K‑M quantised model +model_config = ai.AsposeAIModelConfig() +model_config.allow_auto_download = "true" +model_config.hugging_face_repo_id = "bartowski/Qwen2.5-3B-Instruct-GGUF" +model_config.hugging_face_quantization = "q4_k_m" +model_config.gpu_layers = 20 # Use 20 GPU layers when a GPU is available +``` + +**Πίσω από τη σκηνή:** Το quantised μοντέλο είναι περίπου 1,5 GB στο δίσκο, ένα κλάσμα ενός μοντέλου πλήρους ακρίβειας, αλλά εξακολουθεί να καταγράφει αρκετή γλωσσική λεπτότητα για να εντοπίζει τυπικά ορθογραφικά λάθη OCR. + +--- + +## Βήμα 4: Αρχικοποίηση AsposeAI και προσθήκη του post‑processor ελέγχου ορθογραφίας + +Το Aspose AI περιλαμβάνει έναν έτοιμο post‑processor ελέγχου ορθογραφίας. Με την προσθήκη του, κάθε αποτέλεσμα OCR θα καθαρίζεται αυτόματα. + +```python +# Initialise AsposeAI and attach the built‑in spell‑check post‑processor +ocr_ai = ai.AsposeAI(model_config) # Pass the config we just built +ocr_ai.set_post_processor(ocr_ai.postprocessor_spell_check, {}) # Empty dict → default settings +``` + +**Γιατί να χρησιμοποιήσετε τον post‑processor;** Οι μηχανές OCR συχνά διαβάζουν “Invoice” ως “Invo1ce” ή “Total” ως “T0tal”. Ο έλεγχος ορθογραφίας τρέχει ένα ελαφρύ μοντέλο γλώσσας πάνω στο ακατέργαστο κείμενο και διορθώνει αυτά τα σφάλματα χωρίς να χρειάζεται να γράψετε προσαρμοσμένο λεξικό. + +--- + +## Βήμα 5: Εκτέλεση του post‑processor ελέγχου ορθογραφίας στο αποτέλεσμα OCR + +Με όλα συνδεδεμένα, μια κλήση δίνει το διορθωμένο κείμενο. Εκτυπώνουμε επίσης τόσο την αρχική όσο και την καθαρισμένη έκδοση ώστε να δείτε τη βελτίωση. + +```python +# Run the spell‑check post‑processor on the OCR result +corrected_text = ocr_ai.run_postprocessor(raw_text) + +print("Original :", raw_text) +print("Corrected:", corrected_text) +``` + +Τυπική έξοδος για ένα τιμολόγιο μπορεί να είναι η εξής: + +``` +Original : Invo1ce #12345 +Date: 2023/07/15 +Total: $1,250.00 +... 
+Corrected: Invoice #12345 +Date: 2023/07/15 +Total: $1,250.00 +... +``` + +Παρατηρήστε πώς το “Invo1ce” μετατράπηκε στη σωστή λέξη “Invoice”. Αυτή είναι η δύναμη του ενσωματωμένου AI ελέγχου ορθογραφίας. + +--- + +## Βήμα 6: Απελευθέρωση πόρων GPU – ασφαλής απελευθέρωση πόρων GPU + +Αν τρέχετε αυτόν τον κώδικα σε μια υπηρεσία μακράς διάρκειας (π.χ. ένα web API που επεξεργάζεται δεκάδες τιμολόγια ανά λεπτό), πρέπει να ελευθερώνετε το context του GPU μετά από κάθε παρτίδα. Διαφορετικά θα αντιμετωπίσετε διαρροές μνήμης και τελικά σφάλματα “CUDA out of memory”. + +```python +# Release GPU resources – crucial to avoid memory leaks +ocr_ai.free_resources() +``` + +**Pro tip:** Καλέστε `free_resources()` μέσα σε ένα `finally` block ή σε έναν context manager ώστε να εκτελείται πάντα, ακόμη και αν προκύψει εξαίρεση. + +--- + +## Πλήρες Παράδειγμα Λειτουργίας + +Συνδυάζοντας όλα τα κομμάτια παίρνετε ένα αυτόνομο script που μπορείτε να ενσωματώσετε σε οποιοδήποτε έργο. + +```python +# extract_text_from_image.py +import aocr +import aspose.ai as ai + +def main(): + # Step 1: Initialise OCR engine + ocr_engine = aocr.OcrEngine() + ocr_engine.language = aocr.Language.English + ocr_engine.recognize_mode = aocr.RecognitionMode.Plain + + # Step 2: Load image for OCR + input_image = aocr.Image.load("YOUR_DIRECTORY/sample_invoice.png") + raw_text = ocr_engine.recognize(input_image) + + # Step 3: Configure Aspose AI model + model_cfg = ai.AsposeAIModelConfig() + model_cfg.allow_auto_download = "true" + model_cfg.hugging_face_repo_id = "bartowski/Qwen2.5-3B-Instruct-GGUF" + model_cfg.hugging_face_quantization = "q4_k_m" + model_cfg.gpu_layers = 20 + + # Step 4: Initialise AI and attach spell‑check + ocr_ai = ai.AsposeAI(model_cfg) + ocr_ai.set_post_processor(ocr_ai.postprocessor_spell_check, {}) + + # Step 5: Run spell‑check + corrected_text = ocr_ai.run_postprocessor(raw_text) + + print("Original :", raw_text) + print("Corrected:", corrected_text) + + # Step 6: Release GPU resources 
+ ocr_ai.free_resources() + +if __name__ == "__main__": + main() +``` + +Αποθηκεύστε το αρχείο, προσαρμόστε τη διαδρομή στην εικόνα σας και τρέξτε `python extract_text_from_image.py`. Θα πρέπει να δείτε το καθαρισμένο κείμενο του τιμολογίου να εμφανίζεται στην κονσόλα. + +--- + +## Συχνές Ερωτήσεις (FAQ) + +**Ε: Λειτουργεί αυτό σε μηχανήματα μόνο με CPU;** +Α: Απόλυτα. Αν δεν εντοπιστεί GPU, το Aspose AI επιστρέφει στην εκτέλεση σε CPU, αν και θα είναι πιο αργό. Μπορείτε να εξαναγκάσετε τη χρήση CPU ορίζοντας `model_cfg.gpu_layers = 0`. + +**Ε: Τι γίνεται αν τα τιμολόγια μου είναι σε άλλη γλώσσα εκτός των Αγγλικών;** +Α: Αλλάξτε το `ocr_engine.language` στην αντίστοιχη τιμή enum (π.χ., `aocr.Language.Spanish`). Το μοντέλο ελέγχου ορθογραφίας είναι πολυγλωσσικό, αλλά μπορεί να έχετε καλύτερα αποτελέσματα με μοντέλο ειδικά για τη γλώσσα. + +**Ε: Μπορώ να επεξεργαστώ πολλαπλές εικόνες σε βρόχο;** +Α: Ναι. Απλώς μετακινήστε τα βήματα φόρτωσης, αναγνώρισης και post‑processing μέσα σε έναν `for` βρόχο. Θυμηθείτε να καλέσετε `ocr_ai.free_resources()` μετά τον βρόχο ή μετά από κάθε παρτίδα αν επαναχρησιμοποιείτε την ίδια παρουσία AI. + +**Ε: Πόσο μεγάλο είναι το κατέβασμα του μοντέλου;** +Α: Περίπου 1,5 GB για την έκδοση `q4_k_m`. Αποθηκεύεται στην cache μετά την πρώτη εκτέλεση, οπότε οι επόμενες εκτελέσεις είναι άμεσες. + +--- + +## Συμπέρασμα + +Σε αυτό το tutorial δείξαμε πώς να **εξάγετε κείμενο από εικόνα** χρησιμοποιώντας το Aspose OCR, να διαμορφώσουμε ένα μικρό μοντέλο AI, να εφαρμόσουμε έναν post‑processor ελέγχου ορθογραφίας και να **απελευθερώσουμε με ασφάλεια τους πόρους GPU**. Η ροή εργασίας καλύπτει όλα, από τη φόρτωση της εικόνας μέχρι τον καθαρισμό μετά την ολοκλήρωση, παρέχοντάς σας μια αξιόπιστη pipeline για σενάρια **αναγνώρισης κειμένου από τιμολόγιο**. + +Τι θα κάνετε στη συνέχεια; Δοκιμάστε να αντικαταστήσετε τον έλεγχο ορθογραφίας με ένα προσαρμοσμένο μοντέλο εξαγωγής οντοτήτων. 
+ +{{< /blocks/products/pf/tutorial-page-section >}} +{{< /blocks/products/pf/main-container >}} +{{< /blocks/products/pf/main-wrap-class >}} +{{< blocks/products/products-backtop-button >}} \ No newline at end of file diff --git a/ocr/greek/python/general/how-to-batch-ocr-with-aspose-ocr-full-python-guide/_index.md b/ocr/greek/python/general/how-to-batch-ocr-with-aspose-ocr-full-python-guide/_index.md new file mode 100644 index 000000000..b280b421e --- /dev/null +++ b/ocr/greek/python/general/how-to-batch-ocr-with-aspose-ocr-full-python-guide/_index.md @@ -0,0 +1,217 @@ +--- +category: general +date: 2026-05-03 +description: Πώς να επεξεργαστείτε μαζικά εικόνες με OCR χρησιμοποιώντας το Aspose + OCR και τον AI ορθογραφικό έλεγχο. Μάθετε να εξάγετε κείμενο από εικόνες, να εφαρμόζετε + ορθογραφικό έλεγχο, να χρησιμοποιείτε δωρεάν πόρους AI και να διορθώνετε σφάλματα + OCR. +draft: false +keywords: +- how to batch ocr +- extract text from images +- free ai resources +- apply spell check +- correct ocr errors +language: el +og_description: Πώς να επεξεργαστείτε μαζικά εικόνες με OCR χρησιμοποιώντας το Aspose + OCR και τον AI ορθογραφικό έλεγχο. Ακολουθήστε έναν βήμα‑προς‑βήμα οδηγό για την + εξαγωγή κειμένου από εικόνες, την εφαρμογή ορθογραφικού ελέγχου, τη χρήση δωρεάν + AI πόρων και τη διόρθωση σφαλμάτων OCR. +og_title: Πώς να κάνετε μαζική OCR με το Aspose OCR – Πλήρης οδηγός Python +tags: +- OCR +- Python +- AI +- Aspose +title: Πώς να κάνετε ομαδική OCR με το Aspose OCR – Πλήρης οδηγός Python +url: /el/python/general/how-to-batch-ocr-with-aspose-ocr-full-python-guide/ +--- + +{{< blocks/products/pf/main-wrap-class >}} +{{< blocks/products/pf/main-container >}} +{{< blocks/products/pf/tutorial-page-section >}} + +# Πώς να Εκτελέσετε OCR σε Παρτίδες με Aspose OCR – Πλήρης Οδηγός Python + +Έχετε αναρωτηθεί ποτέ **πώς να εκτελέσετε OCR σε παρτίδες** σε ολόκληρο φάκελο σκαναρισμένων PDF ή φωτογραφιών χωρίς να γράψετε ξεχωριστό script για κάθε αρχείο; Δεν είστε μόνοι. 
Σε πολλές πραγματικές ροές εργασίας θα χρειαστείτε **εξαγωγή κειμένου από εικόνες**, καθαρισμό ορθογραφικών λαθών, και τελικά την απελευθέρωση των πόρων AI που έχετε δεσμεύσει. Αυτό το tutorial σας δείχνει ακριβώς πώς να το κάνετε αυτό με το Aspose OCR, έναν ελαφρύ AI post‑processor, και με λίγες γραμμές Python. + +Θα περάσουμε από την αρχικοποίηση της μηχανής OCR, τη σύνδεση ενός AI ελεγκτή ορθογραφίας, την επανάληψη σε έναν κατάλογο εικόνων, και τον καθαρισμό του μοντέλου στο τέλος. Στο τέλος θα έχετε ένα έτοιμο script που **διορθώνει αυτόματα τα σφάλματα OCR** και απελευθερώνει **ελεύθερους πόρους AI** ώστε η GPU σας να παραμένει χαρούμενη. + +## Τι Θα Χρειαστείτε + +- Python 3.9+ (ο κώδικας χρησιμοποιεί type‑hints αλλά λειτουργεί και σε παλαιότερες εκδόσεις 3.x) +- Πακέτο `asposeocr` (`pip install asposeocr`) – παρέχει τη μηχανή OCR. +- Πρόσβαση στο μοντέλο Hugging Face `bartowski/Qwen2.5-3B-Instruct-GGUF` (κατεβάζεται αυτόματα). +- Μια GPU με τουλάχιστον λίγες GB VRAM (το script ορίζει `gpu_layers = 30`, μπορείτε να το μειώσετε αν χρειαστεί). + +Καμία εξωτερική υπηρεσία, κανένα πληρωμένο API – όλα τρέχουν τοπικά. + +--- + +## Βήμα 1: Ρύθμιση της Μηχανής OCR – **Πώς να Εκτελέσετε OCR σε Παρτίδες** Αποτελεσματικά + +Πριν μπορέσουμε να επεξεργαστούμε χίλιες εικόνες, χρειαζόμαστε μια αξιόπιστη μηχανή OCR. Το Aspose OCR μας επιτρέπει να επιλέξουμε γλώσσα και λειτουργία αναγνώρισης με μία κλήση. + +```python +# Step 1: Initialize the OCR engine for English plain‑text output +def init_ocr() -> aocr.OcrEngine: + ocr_engine = aocr.OcrEngine() + ocr_engine.language = aocr.Language.English # English language pack + ocr_engine.recognize_mode = aocr.RecognitionMode.Plain # Returns raw string, no layout + return ocr_engine +``` + +**Γιατί είναι σημαντικό:** Ορίζοντας το `recognize_mode` σε `Plain` διατηρεί το αποτέλεσμα ελαφρύ, κάτι ιδανικό όταν σκοπεύετε να εκτελέσετε έλεγχο ορθογραφίας αργότερα. 
Αν χρειάζεστε πληροφορίες διάταξης, θα πρέπει να αλλάξετε σε `Layout`, αλλά αυτό προσθέτει επιπλέον φόρτο που πιθανότατα δεν θέλετε σε μια εργασία παρτίδας. + +> **Συμβουλή:** Αν αντιμετωπίζετε πολυγλωσσικά σκαναρίσματα, μπορείτε να περάσετε μια λίστα όπως `ocr_engine.language = [aocr.Language.English, aocr.Language.Spanish]`. + +--- + +## Βήμα 2: Αρχικοποίηση του AI Post‑Processor – **Εφαρμογή Ελέγχου Ορθογραφίας** στο Αποτέλεσμα OCR + +Το Aspose AI έρχεται με ενσωματωμένο post‑processor που μπορεί να τρέξει οποιοδήποτε μοντέλο θέλετε. Εδώ κατεβάζουμε ένα ποσοτικοποιημένο μοντέλο Qwen 2.5 από το Hugging Face και το συνδέουμε με τη ρουτίνα ελέγχου ορθογραφίας. + +```python +# Step 2: Configure and start the AI post‑processor +def init_ai() -> aocr.ai.AsposeAI: + model_cfg = AsposeAIModelConfig() + model_cfg.allow_auto_download = "true" + model_cfg.hugging_face_repo_id = "bartowski/Qwen2.5-3B-Instruct-GGUF" + model_cfg.hugging_face_quantization = "q4_k_m" + model_cfg.gpu_layers = 30 # Adjust based on your GPU memory + ai_processor = AsposeAI() + ai_processor.initialize(model_cfg) + + # Attach the built‑in spell‑check post‑processor + ai_processor.set_post_processor(ai_processor.postprocessor_spell_check, {}) + return ai_processor +``` + +**Γιατί είναι σημαντικό:** Το μοντέλο είναι ποσοτικοποιημένο (`q4_k_m`), κάτι που μειώνει τη χρήση μνήμης ενώ εξακολουθεί να παρέχει αξιοπρεπή κατανόηση της γλώσσας. Καλώντας το `set_post_processor` λέμε στο Aspose AI να εκτελεί αυτόματα το βήμα **εφαρμογής ελέγχου ορθογραφίας** σε κάθε συμβολοσειρά που του δίνουμε. + +> **Προσοχή:** Αν η GPU σας δεν μπορεί να διαχειριστεί 30 επίπεδα, μειώστε τον αριθμό σε 15 ή ακόμη και 5 – το script θα λειτουργήσει, απλώς πιο αργά. + +--- + +## Βήμα 3: Εκτέλεση OCR και **Διόρθωση Σφαλμάτων OCR** σε Μία Μοναδική Εικόνα + +Τώρα που η μηχανή OCR και ο AI ελεγκτής ορθογραφίας είναι έτοιμα, τα συνδυάζουμε. 
Αυτή η συνάρτηση φορτώνει μια εικόνα, εξάγει το ακατέργαστο κείμενο, και στη συνέχεια τρέχει τον AI post‑processor για να το καθαρίσει. + +```python +# Step 3: OCR an image and run the spell‑check post‑processor +def ocr_and_correct(image_path: str, + ocr_engine: aocr.OcrEngine, + ai_processor: aocr.ai.AsposeAI) -> str: + image = aocr.Image.load(image_path) # Load any supported format + raw_text = ocr_engine.recognize(image) # Plain string from OCR + corrected_text = ai_processor.run_postprocessor(raw_text) + return corrected_text +``` + +**Γιατί είναι σημαντικό:** Η άμεση τροφοδοσία της ακατέργαστης συμβολοσειράς OCR στο μοντέλο AI μας δίνει μια **διόρθωση σφαλμάτων OCR** χωρίς να χρειάζεται να γράψουμε regex ή προσαρμοσμένα λεξικά. Το μοντέλο καταλαβαίνει το πλαίσιο, οπότε μπορεί να διορθώσει “recieve” → “receive” και ακόμη πιο λεπτές ατέλειες. + +--- + +## Βήμα 4: **Εξαγωγή Κειμένου από Εικόνες** Μαζικά – Ο Πραγματικός Βρόχος Παρτίδας + +Εδώ φαίνεται η μαγεία του **πώς να εκτελέσετε OCR σε παρτίδες**. Διατρέχουμε έναν φάκελο, παραλείπουμε μη υποστηριζόμενα αρχεία, και γράφουμε κάθε διορθωμένο αποτέλεσμα σε αρχείο `.txt`. 
+ +```python +# Step 4: Process an entire folder of images +if __name__ == "__main__": + # Initialize once – reuse for every file + ocr_engine = init_ocr() + ai_processor = init_ai() + + input_dir = "YOUR_DIRECTORY/input_images" + output_dir = "YOUR_DIRECTORY/output_text" + os.makedirs(output_dir, exist_ok=True) + + for file_name in os.listdir(input_dir): + # Only handle common image extensions + if not file_name.lower().endswith(('.png', '.jpg', '.jpeg', '.tif', '.tiff')): + continue + + image_path = os.path.join(input_dir, file_name) + corrected = ocr_and_correct(image_path, ocr_engine, ai_processor) + + txt_path = os.path.join(output_dir, + os.path.splitext(file_name)[0] + ".txt") + with open(txt_path, "w", encoding="utf-8") as txt_file: + txt_file.write(corrected) + + print(f"Processed {file_name}") + + # Step 5: Release **free AI resources** after the batch finishes + ai_processor.free_resources() +``` + +### Αναμενόμενο αποτέλεσμα + +Για μια εικόνα που περιέχει τη φράση *«The quick brown fox jumps over the lazzy dog.»* θα δείτε ένα αρχείο κειμένου με: + +``` +The quick brown fox jumps over the lazy dog. +``` + +Παρατηρήστε ότι το διπλό “z” διορθώθηκε αυτόματα – αυτό είναι το AI ελέγχου ορθογραφίας σε δράση. + +**Γιατί είναι σημαντικό:** Δημιουργώντας τα αντικείμενα OCR και AI **μία φορά** και επαναχρησιμοποιώντας τα, αποφεύγουμε το κόστος φόρτωσης του μοντέλου για κάθε αρχείο. Αυτή είναι η πιο αποδοτική μέθοδος για **πώς να εκτελέσετε OCR σε παρτίδες** σε κλίμακα. + +--- + +## Βήμα 5: Καθαρισμός – **Απελευθέρωση Πόρων AI** Κατάλληλα + +Όταν τελειώσετε, η κλήση `free_resources()` απελευθερώνει τη μνήμη GPU, τα context CUDA, και τυχόν προσωρινά αρχεία που δημιούργησε το μοντέλο. + +```python +# Step 5: Explicitly free GPU and model memory +ai_processor.free_resources() +``` + +Η παράλειψη αυτού του βήματος μπορεί να αφήσει «κολλημένες» κατανομές GPU, κάτι που μπορεί να καταρρεύσει επόμενες διεργασίες Python ή να καταναλώσει VRAM. 
Σκεφτείτε το ως το «σβήσιμο των φώτων» στο τέλος μιας εργασίας παρτίδας. + +--- + +## Συνηθισμένα Προβλήματα & Επιπλέον Συμβουλές + +| Πρόβλημα | Τι να Αναζητήσετε | Διόρθωση | +|----------|-------------------|----------| +| **Σφάλματα έλλειψης μνήμης** | Η GPU εξαντλείται μετά από μερικές δεκάδες εικόνες | Μειώστε το `gpu_layers` ή μεταβείτε σε CPU (`model_cfg.gpu_layers = 0`). | +| **Λείπει το πακέτο γλώσσας** | Το OCR επιστρέφει κενές συμβολοσειρές | Βεβαιωθείτε ότι η έκδοση `asposeocr` περιλαμβάνει τα δεδομένα γλώσσας Αγγλικών· επανεγκαταστήστε αν χρειάζεται. | +| **Μη‑εικόνα αρχεία** | Το script καταρρέει σε ένα τυχαίο `.pdf` | Η προϋπόθεση `if not file_name.lower().endswith(...)` ήδη τα παραλείπει. | +| **Ο έλεγχος ορθογραφίας δεν εφαρμόζεται** | Το αποτέλεσμα είναι πανομοιότυπο με το ακατέργαστο OCR | Επαληθεύστε ότι κλήθηκε το `ai_processor.set_post_processor` πριν από τον βρόχο. | +| **Αργός ρυθμός παρτίδας** | Παίρνει >5 δευτερόλεπτα ανά εικόνα | Ενεργοποιήστε `model_cfg.allow_auto_download = "false"` μετά την πρώτη εκτέλεση, ώστε το μοντέλο να μην ξανακατεβάζεται κάθε φορά. | + +**Συμβουλή:** Αν χρειάζεται να **εξάγετε κείμενο από εικόνες** σε γλώσσα διαφορετική από τα Αγγλικά, απλώς αλλάξτε το `ocr_engine.language` στο αντίστοιχο enum (π.χ., `aocr.Language.French`). Ο ίδιος AI post‑processor θα εφαρμόσει ακόμη και έλεγχο ορθογραφίας, αλλά ίσως θέλετε ένα μοντέλο ειδικά για τη γλώσσα για βέλτιστα αποτελέσματα. + +--- + +## Ανακεφαλαίωση & Επόμενα Βήματα + +Καλύψαμε ολόκληρη τη ροή για **πώς να εκτελέσετε OCR σε παρτίδες**: + +1. **Αρχικοποίηση** μιας μηχανής OCR απλού κειμένου για τα Αγγλικά. +2. **Διαμόρφωση** μοντέλου ελέγχου ορθογραφίας AI και σύνδεσή του ως post‑processor. +3. **Εκτέλεση** OCR σε κάθε εικόνα και αφήστε το AI να **διορθώσει αυτόματα τα σφάλματα OCR**. +4. **Βρόχος** σε έναν φάκελο για **μαζική εξαγωγή κειμένου από εικόνες**. +5. **Απελευθέρωση** των πόρων AI μόλις ολοκληρωθεί η εργασία. 
+ +Από εδώ μπορείτε: + +- Να διοχετεύσετε το διορθωμένο κείμενο σε μια επόμενη pipeline NLP (ανάλυση συναισθήματος, εξαγωγή οντοτήτων κ.λπ.). +- Να αντικαταστήσετε το post‑processor ελέγχου ορθογραφίας με έναν προσαρμοσμένο summarizer καλώντας `ai_processor.set_post_processor(your_custom_func, {})`. +- Να παραλληλοποιήσετε τον βρόχο φακέλου με `concurrent.futures.ThreadPoolExecutor` αν η GPU σας μπορεί να διαχειριστεί πολλαπλές ροές. + +--- + +## Τελευταίες Σκέψεις + +Η παρτίδα OCR δεν χρειάζεται να είναι κουραστική. Εκμεταλλευόμενοι το Aspose OCR μαζί με ένα ελαφρύ μοντέλο AI, αποκτάτε μια **ολοκληρωμένη λύση** που **εξάγει κείμενο από εικόνες**, **εφαρμόζει έλεγχο ορθογραφίας**, **διορθώνει σφάλματα OCR**, και **απελευθερώνει πόρους AI** με καθαρό τρόπο. Δοκιμάστε το script σε έναν δοκιμαστικό φάκελο, ρυθμίστε τον αριθμό επιπέδων GPU ώστε να ταιριάζει στο υλικό σας, και θα έχετε μια παραγωγική pipeline σε λίγα λεπτά. + +Έχετε ερωτήσεις για την προσαρμογή του μοντέλου, τη διαχείριση PDF, ή την ενσωμάτωση σε web service; Αφήστε ένα σχόλιο παρακάτω ή επικοινωνήστε μαζί μου στο GitHub. Καλό κώδικα, και να είναι το OCR σας πάντα ακριβές! + +{{< /blocks/products/pf/tutorial-page-section >}} +{{< /blocks/products/pf/main-container >}} +{{< /blocks/products/pf/main-wrap-class >}} +{{< blocks/products/products-backtop-button >}} \ No newline at end of file diff --git a/ocr/greek/python/general/python-ocr-tutorial-batch-ocr-processing-made-easy/_index.md b/ocr/greek/python/general/python-ocr-tutorial-batch-ocr-processing-made-easy/_index.md new file mode 100644 index 000000000..10e2f6e12 --- /dev/null +++ b/ocr/greek/python/general/python-ocr-tutorial-batch-ocr-processing-made-easy/_index.md @@ -0,0 +1,300 @@ +--- +category: general +date: 2026-05-03 +description: Μάθημα Python OCR που δείχνει πώς να φορτώνετε αρχεία εικόνας PNG, να + αναγνωρίζετε κείμενο από την εικόνα και δωρεάν πόρους AI για ομαδική επεξεργασία + OCR. 
+draft: false +keywords: +- python ocr tutorial +- batch ocr processing +- free ai resources +- load png image +- recognize text from image +language: el +og_description: Το σεμινάριο Python OCR σας καθοδηγεί στη φόρτωση εικόνων PNG, στην + αναγνώριση κειμένου από την εικόνα και στη διαχείριση δωρεάν πόρων AI για επεξεργασία + OCR σε παρτίδες. +og_title: Python OCR Tutorial – Γρήγορη ομαδική OCR με δωρεάν πόρους AI +tags: +- OCR +- Python +- AI +title: Python OCR Tutorial – Εύκολη μαζική επεξεργασία OCR +url: /el/python/general/python-ocr-tutorial-batch-ocr-processing-made-easy/ +--- + +{{< blocks/products/pf/main-wrap-class >}} +{{< blocks/products/pf/main-container >}} +{{< blocks/products/pf/tutorial-page-section >}} + +# Οδηγός Python OCR – Εύκολη Επεξεργασία OCR σε Παρτίδες + +Κάποτε χρειάστηκε ένας **python ocr tutorial** που πραγματικά να σας επιτρέπει να τρέχετε OCR σε δεκάδες αρχεία PNG χωρίς να τρελαίνεστε; Δεν είστε μόνοι. Σε πολλά πραγματικά έργα πρέπει να **load png image** αρχεία, να τα δίνετε σε μια μηχανή, και μετά να καθαρίζετε τους πόρους AI όταν τελειώσετε. + +Σε αυτόν τον οδηγό θα περάσουμε βήμα‑βήμα από ένα πλήρες, έτοιμο‑για‑εκτέλεση παράδειγμα που δείχνει ακριβώς πώς να **recognize text from image** αρχεία, να τα επεξεργαστείτε σε παρτίδες, και να ελευθερώσετε τη μνήμη AI. Στο τέλος θα έχετε ένα αυτόνομο script που μπορείτε να ενσωματώσετε σε οποιοδήποτε έργο—χωρίς περιττά πρόσθετα, μόνο τα ουσιώδη. + +## Τι Θα Χρειαστείτε + +- Python 3.10 ή νεότερη (η σύνταξη που χρησιμοποιείται εδώ βασίζεται σε f‑strings και type hints) +- Μια βιβλιοθήκη OCR που εκθέτει μέθοδο `engine.recognize` – για σκοπούς επίδειξης θα υποθέσουμε ένα φανταστικό πακέτο `aocr`, αλλά μπορείτε να το αντικαταστήσετε με Tesseract, EasyOCR κ.λπ. 
+- Το βοηθητικό module `ai` που φαίνεται στο απόσπασμα κώδικα (διαχειρίζεται την αρχικοποίηση του μοντέλου και τον καθαρισμό πόρων)
+- Έναν φάκελο γεμάτο PNG αρχεία που θέλετε να επεξεργαστείτε
+
+Αν δεν έχετε εγκατεστημένα τα `aocr` ή `ai`, μπορείτε να τα μιμηθείτε με stubs – δείτε την ενότητα «Optional Stubs» στο τέλος.
+
+## Βήμα 1: Αρχικοποίηση της Μηχανής AI (Free AI Resources)
+
+Πριν δώσετε οποιαδήποτε εικόνα στη γραμμή OCR, το υποκείμενο μοντέλο πρέπει να είναι έτοιμο. Η αρχικοποίηση μόνο μία φορά εξοικονομεί μνήμη και επιταχύνει τις εργασίες παρτίδας.
+
+```python
+# step_1_initialize.py
+import ai # hypothetical helper that wraps the AI model
+import aocr # OCR library
+
+def init_engine(config_path: str = "config.yaml"):
+    """
+    Initialize the AI engine if it hasn't been set up yet.
+    This uses free AI resources – the engine will be released later.
+    """
+    if not ai.is_initialized():
+        ai.initialize(config_path) # auto‑initialize with the provided configuration
+    else:
+        print("Engine already initialized.")
+```
+
+**Γιατί είναι σημαντικό:**
+Η κλήση του `ai.initialize` επανειλημμένα για κάθε εικόνα θα δέσμευε μνήμη GPU ξανά και ξανά, οδηγώντας τελικά σε κατάρρευση του script. Με τον έλεγχο `ai.is_initialized()` διασφαλίζουμε μία μόνο κατανομή – αυτή είναι η αρχή του «Free AI resources».
+
+## Βήμα 2: Φόρτωση Αρχείων PNG για Επεξεργασία OCR σε Παρτίδες
+
+Τώρα συγκεντρώνουμε όλα τα PNG αρχεία που θέλουμε να τρέξουμε μέσω OCR. Η χρήση του `pathlib` κρατά τον κώδικα ανεξάρτητο από το λειτουργικό σύστημα.
+
+```python
+# step_2_load_images.py
+from pathlib import Path
+from typing import List
+
+def collect_png_paths(directory: str) -> List[Path]:
+    """
+    Scan `directory` and return a list of Path objects pointing to PNG files. 
+ """ + base_path = Path(directory) + if not base_path.is_dir(): + raise NotADirectoryError(f"'{directory}' is not a valid folder.") + + png_files = sorted(base_path.glob("*.png")) + if not png_files: + raise FileNotFoundError("No PNG images found in the specified directory.") + + print(f"Found {len(png_files)} PNG image(s) to process.") + return png_files +``` + +**Ακρόαση περιπτώσεων:** +Αν ο φάκελος περιέχει αρχεία που δεν είναι PNG (π.χ. JPEG), θα αγνοηθούν, αποτρέποντας το `engine.recognize` από το να «πνίγει» σε μη υποστηριζόμενη μορφή. + +## Βήμα 3: Εκτέλεση OCR σε Κάθε Εικόνα και Εφαρμογή Μετα‑επεξεργασίας + +Με τη μηχανή έτοιμη και τη λίστα αρχείων προετοιμασμένη, μπορούμε να διασχίσουμε τις εικόνες, να εξάγουμε ακατέργαστο κείμενο, και να το περάσουμε σε μετα‑επεξεργαστή που καθαρίζει κοινά artefacts OCR (όπως ανεπιθύμητες αλλαγές γραμμής). + +```python +# step_3_ocr_batch.py +import aocr +import ai +from pathlib import Path +from typing import List + +def ocr_batch(image_paths: List[Path]) -> List[str]: + """ + Perform OCR on each PNG image and return a list of cleaned strings. + """ + results = [] + for image_path in image_paths: + # Load the image – aocr.Image.load abstracts away Pillow/OpenCV details + img = aocr.Image.load(str(image_path)) + + # Recognize raw text + raw_text = engine.recognize(img) + + # Refine the raw OCR output using the AI post‑processor + cleaned_text = ai.run_postprocessor(raw_text) + results.append(cleaned_text) + + print(f"Processed {image_path.name}: {len(cleaned_text)} characters extracted.") + + return results +``` + +**Γιατί χωρίζουμε τη φόρτωση από την αναγνώριση:** +Η `aocr.Image.load` μπορεί να κάνει lazy decoding, κάτι που είναι γρηγορότερο για μεγάλες παρτίδες. Η ρητή φάση φόρτωσης κάνει επίσης εύκολο το swap σε διαφορετική βιβλιοθήκη εικόνας αν αργότερα χρειαστεί να υποστηρίξετε JPEG ή TIFF. 
+ +## Βήμα 4: Καθαρισμός – Ελευθέρωση Πόρων AI μετά την Παρτίδα + +Μόλις ολοκληρωθεί η παρτίδα, πρέπει να απελευθερώσουμε το μοντέλο για να αποφύγουμε διαρροές μνήμης, ειδικά σε μηχανές με GPU. + +```python +# step_4_cleanup.py +import ai + +def release_resources(): + """ + Free any allocated AI resources. Safe to call multiple times. + """ + if ai.is_initialized(): + ai.free_resources() + print("AI resources have been released.") + else: + print("No AI resources were allocated.") +``` + +## Συνδυασμός Όλων – Το Πλήρες Script + +Παρακάτω υπάρχει ένα μοναδικό αρχείο που ενώνει τα τέσσερα βήματα σε μια συνεκτική ροή εργασίας. Αποθηκεύστε το ως `batch_ocr.py` και τρέξτε το από τη γραμμή εντολών. + +```python +# batch_ocr.py +""" +Python OCR tutorial – end‑to‑end batch OCR processing. +Loads PNG images, runs OCR, post‑processes results, and frees AI resources. +""" + +import sys +from pathlib import Path +import ai +import aocr + +# ---------------------------------------------------------------------- +# Helper functions (copied from the steps above) +# ---------------------------------------------------------------------- +def init_engine(cfg: str = "config.yaml"): + if not ai.is_initialized(): + ai.initialize(cfg) + else: + print("Engine already initialized.") + +def collect_png_paths(directory: str): + base_path = Path(directory) + if not base_path.is_dir(): + raise NotADirectoryError(f"'{directory}' is not a valid folder.") + png_files = sorted(base_path.glob("*.png")) + if not png_files: + raise FileNotFoundError("No PNG images found in the specified directory.") + print(f"Found {len(png_files)} PNG image(s) to process.") + return png_files + +def ocr_batch(image_paths): + results = [] + for image_path in image_paths: + img = aocr.Image.load(str(image_path)) + raw_text = engine.recognize(img) + cleaned_text = ai.run_postprocessor(raw_text) + results.append(cleaned_text) + print(f"Processed {image_path.name}: {len(cleaned_text)} characters.") + return results + 
+def release_resources(): + if ai.is_initialized(): + ai.free_resources() + print("AI resources released.") + else: + print("No resources to release.") + +# ---------------------------------------------------------------------- +# Main execution block +# ---------------------------------------------------------------------- +if __name__ == "__main__": + if len(sys.argv) != 2: + print("Usage: python batch_ocr.py ") + sys.exit(1) + + image_dir = sys.argv[1] + + try: + init_engine() + png_paths = collect_png_paths(image_dir) + texts = ocr_batch(png_paths) + + # Optional: write results to a single text file + output_file = Path("ocr_results.txt") + with output_file.open("w", encoding="utf-8") as f: + for path, txt in zip(png_paths, texts): + f.write(f"--- {path.name} ---\n") + f.write(txt + "\n\n") + print(f"All results saved to {output_file.resolve()}") + finally: + release_resources() +``` + +### Αναμενόμενο Αποτέλεσμα + +Η εκτέλεση του script σε φάκελο με τρία PNG μπορεί να εμφανίσει: + +``` +Engine already initialized. +Found 3 PNG image(s) to process. +Processed invoice1.png: 452 characters. +Processed receipt2.png: 317 characters. +Processed flyer3.png: 689 characters. +All results saved to /home/user/ocr_results.txt +AI resources released. +``` + +Το αρχείο `ocr_results.txt` θα περιέχει έναν σαφή διαχωριστή για κάθε εικόνα, ακολουθούμενο από το καθαρισμένο κείμενο OCR. 
+ +## Optional Stubs για aocr & ai (Αν Δεν Έχετε Πραγματικά Πακέτα) + +Αν θέλετε απλώς να δοκιμάσετε τη ροή χωρίς βαριές βιβλιοθήκες OCR, μπορείτε να δημιουργήσετε ελάχιστα mock modules: + +```python +# aocr/__init__.py +class Image: + @staticmethod + def load(path): + return f"ImageObject({path})" + +def dummy_recognize(image): + return "Raw OCR output for " + str(image) + +engine = type("Engine", (), {"recognize": dummy_recognize})() +``` + +```python +# ai/__init__.py +_state = {"initialized": False} + +def is_initialized(): + return _state["initialized"] + +def initialize(cfg): + print(f"Initializing AI engine with {cfg}") + _state["initialized"] = True + +def run_postprocessor(text): + # Very naive cleanup: strip extra spaces + return " ".join(text.split()) + +def free_resources(): + print("Freeing AI resources") + _state["initialized"] = False +``` + +Τοποθετήστε αυτούς τους φακέλους δίπλα στο `batch_ocr.py` και το script θα τρέξει, εκτυπώνοντας ψεύτικα αποτελέσματα. + +## Pro Tips & Συνηθισμένα Πιθανά Σφάλματα + +- **Αιχμές μνήμης:** Αν επεξεργάζεστε χιλιάδες PNG υψηλής ανάλυσης, σκεφτείτε να τα μειώσετε πριν το OCR. Η `aocr.Image.load` συχνά δέχεται όρισμα `max_size`. +- **Διαχείριση Unicode:** Πάντα ανοίγετε το αρχείο εξόδου με `encoding="utf-8"`· οι μηχανές OCR μπορούν να εκπονήσουν μη‑ASCII χαρακτήρες. +- **Παράλληλη εκτέλεση:** Για OCR που περιορίζεται από CPU, μπορείτε να τυλίξετε το `ocr_batch` σε `concurrent.futures.ThreadPoolExecutor`. Θυμηθείτε όμως να διατηρείτε μία μόνο παρουσία `ai` – η δημιουργία πολλών νημάτων που καλούν `ai.initialize` αντιστέκεται στον στόχο «Free AI resources». +- **Ανθεκτικότητα σε σφάλματα:** Τυλίξτε τον βρόχο ανά‑εικόνα σε `try/except` ώστε ένα κατεστραμμένο PNG να μην διακόψει όλη τη παρτίδα. + +## Συμπέρασμα + +Τώρα έχετε έναν **python ocr tutorial** που δείχνει πώς να **load png image** αρχεία, να εκτελείτε **batch OCR processing**, και να διαχειρίζεστε υπεύθυνα **free AI resources**. 
Το πλήρες, εκτελέσιμο παράδειγμα δείχνει ακριβώς πώς να **recognize text from image** αντικείμενα και να καθαρίζετε μετά, ώστε να το αντιγράψετε‑επικολλήσετε στα δικά σας έργα χωρίς να ψάχνετε για ελλείποντα κομμάτια. + +Έτοιμοι για το επόμενο βήμα; Δοκιμάστε να αντικαταστήσετε τα stub `aocr` και `ai` με πραγματικές βιβλιοθήκες όπως `pytesseract` και `torchvision`. Μπορείτε επίσης να επεκτείνετε το script ώστε να εξάγει JSON, να στέλνει αποτελέσματα σε βάση δεδομένων, ή να ενσωματώνεται με cloud storage bucket. Ο ουρανός είναι το όριο—καλή προγραμματιστική! + +{{< /blocks/products/pf/tutorial-page-section >}} +{{< /blocks/products/pf/main-container >}} +{{< /blocks/products/pf/main-wrap-class >}} +{{< blocks/products/products-backtop-button >}} \ No newline at end of file diff --git a/ocr/greek/python/general/run-ocr-on-image-complete-guide-to-structured-text-extractio/_index.md b/ocr/greek/python/general/run-ocr-on-image-complete-guide-to-structured-text-extractio/_index.md new file mode 100644 index 000000000..13712aa16 --- /dev/null +++ b/ocr/greek/python/general/run-ocr-on-image-complete-guide-to-structured-text-extractio/_index.md @@ -0,0 +1,254 @@ +--- +category: general +date: 2026-05-03 +description: Μάθετε πώς να εκτελείτε OCR σε εικόνα και να εξάγετε κείμενο με συντεταγμένες + χρησιμοποιώντας δομημένη αναγνώριση OCR. Περιλαμβάνεται βήμα‑βήμα κώδικας Python. +draft: false +keywords: +- run OCR on image +- extract text with coordinates +- structured OCR recognition +- OCR post‑processing +- bounding box extraction +- image text detection +language: el +og_description: Εκτελέστε OCR σε εικόνα και λάβετε το κείμενο με συντεταγμένες χρησιμοποιώντας + δομημένη αναγνώριση OCR. Πλήρες παράδειγμα Python με επεξηγήσεις. 
+og_title: Εκτέλεση OCR σε εικόνα – Εκπαιδευτικό για εξαγωγή δομημένου κειμένου +tags: +- OCR +- Python +- Computer Vision +title: Εκτέλεση OCR σε εικόνα – Πλήρης οδηγός για εξαγωγή δομημένου κειμένου +url: /el/python/general/run-ocr-on-image-complete-guide-to-structured-text-extractio/ +--- + +{{< blocks/products/pf/main-wrap-class >}} +{{< blocks/products/pf/main-container >}} +{{< blocks/products/pf/tutorial-page-section >}} + +# Εκτέλεση OCR σε εικόνα – Πλήρης Οδηγός για Εξαγωγή Δομημένου Κειμένου + +Κάποτε χρειάστηκε να **run OCR on image** αρχεία αλλά δεν ήξερες πώς να διατηρήσεις τις ακριβείς θέσεις κάθε λέξης; Δεν είσαι μόνος σου. Σε πολλά έργα—σάρωση αποδείξεων, ψηφιοποίηση φορμών ή δοκιμές UI—χρειάζεσαι όχι μόνο το ακατέργαστο κείμενο αλλά και τα bounding boxes που δείχνουν πού βρίσκεται κάθε γραμμή στην εικόνα. + +Αυτό το tutorial σου δείχνει έναν πρακτικό τρόπο να *run OCR on image* χρησιμοποιώντας τη μηχανή **aocr**, να ζητήσεις **structured OCR recognition**, και στη συνέχεια να κάνεις post‑processing του αποτελέσματος διατηρώντας τη γεωμετρία. Στο τέλος θα μπορείς να **extract text with coordinates** σε λίγες γραμμές Python και θα καταλάβεις γιατί η δομημένη λειτουργία είναι σημαντική για επόμενα βήματα. + +## Τι Θα Μάθεις + +- Πώς να αρχικοποιήσεις τη μηχανή OCR για **structured OCR recognition**. +- Πώς να τροφοδοτήσεις μια εικόνα και να λάβεις ακατέργαστα αποτελέσματα που περιλαμβάνουν τα όρια των γραμμών. +- Πώς να τρέξεις έναν post‑processor που καθαρίζει το κείμενο χωρίς να χάνει τη γεωμετρία. +- Πώς να επαναλάβεις τις τελικές γραμμές και να εκτυπώσεις κάθε κομμάτι κειμένου μαζί με το bounding box του. + +Καμία μαγεία, κανένα κρυφό βήμα—απλώς ένα πλήρες, εκτελέσιμο παράδειγμα που μπορείς να ενσωματώσεις στο δικό σου έργο. 
+ +--- + +## Προαπαιτούμενα + +Πριν ξεκινήσουμε, βεβαιώσου ότι έχεις εγκαταστήσει τα παρακάτω: + +```bash +pip install aocr ai # hypothetical packages; replace with real ones if needed +``` + +Θα χρειαστείς επίσης ένα αρχείο εικόνας (`input_image.png` ή `.jpg`) που περιέχει καθαρό, αναγνώσιμο κείμενο. Οτιδήποτε από μια σαρωμένη τιμολόγηση μέχρι ένα screenshot λειτουργεί, εφόσον η μηχανή OCR μπορεί να δει τους χαρακτήρες. + +--- + +## Βήμα 1: Αρχικοποίηση της μηχανής OCR για δομημένη αναγνώριση + +Το πρώτο που κάνουμε είναι να δημιουργήσουμε μια παρουσία του `aocr.Engine()` και να του πούμε ότι θέλουμε **structured OCR recognition**. Η δομημένη λειτουργία επιστρέφει όχι μόνο το απλό κείμενο αλλά και γεωμετρικά δεδομένα (bounding rectangles) για κάθε γραμμή, κάτι που είναι απαραίτητο όταν χρειάζεται να χαρτογραφήσεις το κείμενο πίσω στην εικόνα. + +```python +import aocr +import ai # hypothetical post‑processing module + +# Initialise the OCR engine +ocr_engine = aocr.Engine() + +# Request structured recognition (text + geometry) +ocr_engine.recognize_mode = aocr.RecognitionMode.Structured +``` + +> **Γιατί είναι σημαντικό:** +> Στην προεπιλεγμένη λειτουργία η μηχανή μπορεί να σου δώσει μόνο μια συμβολοσειρά ενωμένων λέξεων. Η δομημένη λειτουργία σου παρέχει μια ιεραρχία σελίδες → γραμμές → λέξεις, καθεμία με συντεταγμένες, κάνοντας πολύ πιο εύκολο το overlay των αποτελεσμάτων στην αρχική εικόνα ή την τροφοδοσία τους σε μοντέλο που λαμβάνει υπόψη τη διάταξη. + +--- + +## Βήμα 2: Εκτέλεση OCR στην εικόνα και λήψη ακατέργαστων αποτελεσμάτων + +Τώρα τροφοδοτούμε την εικόνα στη μηχανή. Η κλήση `recognize` επιστρέφει ένα αντικείμενο `OcrResult` που περιέχει μια συλλογή γραμμών, καθεμία με το δικό της bounding rectangle. 
+ +```python +# Load your image (any format supported by aocr) +input_image_path = "input_image.png" + +# Run OCR – this returns an OcrResult with lines and bounds +raw_result = ocr_engine.recognize(input_image_path) +``` + +Σε αυτό το σημείο το `raw_result.lines` περιέχει αντικείμενα με δύο σημαντικά χαρακτηριστικά: + +- `text` – η αναγνωρισμένη συμβολοσειρά για αυτή τη γραμμή. +- `bounds` – ένα tuple όπως `(x, y, width, height)` που περιγράφει τη θέση της γραμμής. + +--- + +## Βήμα 3: Post‑process διατηρώντας τη γεωμετρία + +Τα ακατέργαστα αποτελέσματα OCR είναι συχνά θορυβώδη: περιττοί χαρακτήρες, λανθασμένα κενά ή προβλήματα line‑break. Η συνάρτηση `ai.run_postprocessor` καθαρίζει το κείμενο αλλά **διατηρεί την αρχική γεωμετρία** άθικτη, ώστε να έχεις ακόμη ακριβείς συντεταγμένες. + +```python +# Apply a post‑processing step that corrects common OCR errors +postprocessed_result = ai.run_postprocessor(raw_result) + +# The structure (lines + bounds) stays the same, only `line.text` changes +``` + +> **Pro tip:** Αν έχεις λεξικά ειδικά για το domain σου (π.χ. κωδικοί προϊόντων), δώσε ένα προσαρμοσμένο λεξικό στον post‑processor για να βελτιώσεις την ακρίβεια. + +--- + +## Βήμα 4: Εξαγωγή κειμένου με συντεταγμένες – επανάληψη και εμφάνιση + +Τέλος, κάνουμε βρόχο πάνω από τις καθαρισμένες γραμμές, εκτυπώνοντας το bounding box της κάθε γραμμής μαζί με το κείμενό της. Αυτό είναι το κεντρικό μέρος του **extract text with coordinates**. 
+ +```python +# Print each recognised line together with its bounding box +for line in postprocessed_result.lines: + print(f"[{line.bounds}] {line.text}") +``` + +### Αναμενόμενο Αποτέλεσμα + +Υποθέτοντας ότι η είσοδος εικόνα περιέχει δύο γραμμές: “Invoice #12345” και “Total: $89.99”, θα δεις κάτι σαν: + +``` +[(15, 30, 210, 25)] Invoice #12345 +[(15, 70, 190, 25)] Total: $89.99 +``` + +Το πρώτο tuple είναι το `(x, y, width, height)` της γραμμής στην αρχική εικόνα, επιτρέποντάς σου να σχεδιάσεις ορθογώνια, να επισημάνεις κείμενο ή να τροφοδοτήσεις τις συντεταγμένες σε άλλο σύστημα. + +--- + +## Οπτικοποίηση του Αποτελέσματος (Προαιρετικό) + +Αν θέλεις να δεις τα bounding boxes πάνω στην εικόνα, μπορείς να χρησιμοποιήσεις το Pillow (PIL) για να σχεδιάσεις ορθογώνια. Παρακάτω υπάρχει ένα γρήγορο snippet· μπορείς να το παραλείψεις αν χρειάζεσαι μόνο τα ακατέργαστα δεδομένα. + +```python +from PIL import Image, ImageDraw + +# Open the original image +img = Image.open(input_image_path) +draw = ImageDraw.Draw(img) + +# Draw a rectangle around each line +for line in postprocessed_result.lines: + x, y, w, h = line.bounds + draw.rectangle([x, y, x + w, y + h], outline="red", width=2) + +# Save or show the annotated image +img.save("annotated_output.png") +img.show() +``` + +![run OCR on image example showing bounding boxes](/images/ocr-bounding-boxes.png "run OCR on image – bounding box overlay") + +Το alt κείμενο παραπάνω περιέχει τη **primary keyword**, ικανοποιώντας την απαίτηση SEO για τα alt attributes των εικόνων. + +--- + +## Γιατί η Structured OCR Recognition Υπερισχύει της Απλής Εξαγωγής Κειμένου + +Μπορεί να αναρωτιέσαι, “Μπορώ απλώς να τρέξω OCR και να πάρω το κείμενο; Γιατί να ασχοληθώ με τη γεωμετρία;” + +- **Χωρικό πλαίσιο:** Όταν χρειάζεται να τοποθετήσεις πεδία σε μια φόρμα (π.χ. “Date” δίπλα σε μια τιμή ημερομηνίας), οι συντεταγμένες σου λένε *πού* βρίσκεται το δεδομένο. 
+- **Διατάξεις πολλαπλών στηλών:** Το απλό γραμμικό κείμενο χάνει τη σειρά· τα δομημένα δεδομένα διατηρούν τη σειρά των στηλών. +- **Ακρίβεια post‑processing:** Η γνώση του μεγέθους του κουτιού σε βοηθά να αποφασίσεις αν μια λέξη είναι κεφαλίδα, υποσημείωση ή τυχαίο απόσπασμα. + +Συνοπτικά, η **structured OCR recognition** σου δίνει την ευελιξία να χτίσεις πιο έξυπνες pipelines—είτε τροφοδοτείς δεδομένα σε βάση, δημιουργείς αναζητήσιμα PDF ή εκπαιδεύεις μοντέλο μηχανικής μάθησης που σέβεται τη διάταξη. + +--- + +## Συνηθισμένες Ακραίες Περιπτώσεις και Πώς να τις Αντιμετωπίσεις + +| Κατάσταση | Σε τι Πρέπει να Προσέχεις | Προτεινόμενη Λύση | +|-----------|---------------------------|-------------------| +| **Περιστροφές ή κλίσεις εικόνας** | Τα bounding boxes μπορεί να είναι εκτός άξονα. | Προεπεξεργασία με deskewing (π.χ. `warpAffine` του OpenCV). | +| **Πολύ μικρές γραμματοσειρές** | Η μηχανή μπορεί να χάσει χαρακτήρες, οδηγώντας σε κενές γραμμές. | Αυξήστε την ανάλυση της εικόνας ή χρησιμοποιήστε `ocr_engine.set_dpi(300)`. | +| **Μικτές γλώσσες** | Λάθος μοντέλο γλώσσας μπορεί να παράγει ακατάληπτο κείμενο. | Ορίστε `ocr_engine.language = ["en", "de"]` πριν την αναγνώριση. | +| **Αλληλοεπικαλυπτόμενα κουτιά** | Ο post‑processor μπορεί να συγχωνεύσει δύο γραμμές ακούσια. | Επαληθεύστε `line.bounds` μετά την επεξεργασία· προσαρμόστε τα όρια στο `ai.run_postprocessor`. | + +Η αντιμετώπιση αυτών των σεναρίων νωρίς αποτρέπει προβλήματα αργότερα, ειδικά όταν κλιμακώνεις τη λύση σε εκατοντάδες έγγραφα την ημέρα. + +--- + +## Πλήρες Script End‑to‑End + +Παρακάτω βρίσκεται το ολοκληρωμένο, έτοιμο‑για‑εκτέλεση πρόγραμμα που ενώνει όλα τα βήματα. Αντέγραψε‑επικόλλησε, προσαρμόζε το μονοπάτι της εικόνας, και είσαι έτοιμος. + +```python +# -*- coding: utf-8 -*- +""" +Run OCR on image – extract text with coordinates using structured OCR recognition. 
+Author: Your Name
+Date: 2026-05-03
+"""
+
+import aocr
+import ai
+from PIL import Image, ImageDraw
+
+def run_structured_ocr(image_path: str, annotate: bool = False):
+    # 1️⃣ Initialise the OCR engine
+    ocr_engine = aocr.Engine()
+    ocr_engine.recognize_mode = aocr.RecognitionMode.Structured
+
+    # 2️⃣ Recognise the image
+    raw_result = ocr_engine.recognize(image_path)
+
+    # 3️⃣ Post‑process while keeping geometry
+    processed = ai.run_postprocessor(raw_result)
+
+    # 4️⃣ Print each line with its bounding box
+    for line in processed.lines:
+        print(f"[{line.bounds}] {line.text}")
+
+    # Optional visualisation
+    if annotate:
+        img = Image.open(image_path)
+        draw = ImageDraw.Draw(img)
+        for line in processed.lines:
+            x, y, w, h = line.bounds
+            draw.rectangle([x, y, x + w, y + h], outline="red", width=2)
+        annotated_path = "annotated_" + image_path
+        img.save(annotated_path)
+        print(f"Annotated image saved as {annotated_path}")
+
+if __name__ == "__main__":
+    INPUT_IMG = "input_image.png"
+    run_structured_ocr(INPUT_IMG, annotate=True)
+```
+
+Η εκτέλεση αυτού του script:
+
+1. **Run OCR on image** με δομημένη λειτουργία.
+2. **Extract text with coordinates** για κάθε γραμμή.
+3. Προαιρετικά παράγει ένα PNG με τα annotations.
+
+---
+
+## Συμπέρασμα
+
+Τώρα διαθέτεις μια ολοκληρωμένη, αυτόνομη λύση για **run OCR on image** και **extract text with coordinates** χρησιμοποιώντας **structured OCR recognition**. Ο κώδικας δείχνει κάθε βήμα—από την αρχικοποίηση της μηχανής μέχρι το post‑processing και την οπτική επαλήθευση—ώστε να το προσαρμόσεις σε αποδείξεις, φόρμες ή οποιοδήποτε οπτικό έγγραφο που απαιτεί ακριβή εντοπισμό κειμένου.
+
+Τι θα κάνεις μετά; Δοκίμασε να αντικαταστήσεις τη μηχανή `aocr` με άλλη βιβλιοθήκη (Tesseract, EasyOCR) και δες πώς διαφέρουν τα δομημένα αποτελέσματά τους. Πειραματίσου με διαφορετικές στρατηγικές post‑processing, όπως ορθογραφικό έλεγχο ή προσαρμοσμένα regex φίλτρα, για να αυξήσεις την ακρίβεια στο domain σου. 
Και αν χτίζεις μεγαλύτερη pipeline, σκέψου την αποθήκευση των ζευγών `(text, bounds)` σε βάση δεδομένων για μελλοντική ανάλυση. + +Καλή προγραμματιστική δουλειά, και οι OCR προσπάθειές σου να είναι πάντα ακριβείς! + +{{< /blocks/products/pf/tutorial-page-section >}} +{{< /blocks/products/pf/main-container >}} +{{< /blocks/products/pf/main-wrap-class >}} +{{< blocks/products/products-backtop-button >}} \ No newline at end of file diff --git a/ocr/hindi/python/general/extract-text-from-image-ocr-with-aspose-ai-spell-check/_index.md b/ocr/hindi/python/general/extract-text-from-image-ocr-with-aspose-ai-spell-check/_index.md new file mode 100644 index 000000000..daac706b7 --- /dev/null +++ b/ocr/hindi/python/general/extract-text-from-image-ocr-with-aspose-ai-spell-check/_index.md @@ -0,0 +1,210 @@ +--- +category: general +date: 2026-05-03 +description: Aspose OCR और AI स्पेल‑चेक का उपयोग करके छवि से टेक्स्ट निकालें। सीखें + कि कैसे छवि को OCR करें, OCR के लिए छवि लोड करें, इनवॉइस से टेक्स्ट पहचानें और GPU + संसाधनों को रिलीज़ करें। +draft: false +keywords: +- extract text from image +- how to ocr image +- load image for ocr +- release gpu resources +- recognize text from invoice +language: hi +og_description: Aspose OCR और AI वर्तनी‑जाँच के साथ छवि से टेक्स्ट निकालें। चरण‑दर‑चरण + गाइड जिसमें छवि को OCR करने, OCR के लिए छवि लोड करने और GPU संसाधनों को रिलीज़ करने + की प्रक्रिया शामिल है। +og_title: छवि से टेक्स्ट निकालें – पूर्ण OCR और स्पेल‑चेक गाइड +tags: +- OCR +- Aspose +- AI +- Python +title: छवि से पाठ निकालें – Aspose AI स्पेल‑चेक के साथ OCR +url: /hi/python/general/extract-text-from-image-ocr-with-aspose-ai-spell-check/ +--- + +{{< blocks/products/pf/main-wrap-class >}} +{{< blocks/products/pf/main-container >}} +{{< blocks/products/pf/tutorial-page-section >}} + +# छवि से टेक्स्ट निकालें – Complete OCR & Spell‑Check Guide + +क्या आपको कभी **छवि से टेक्स्ट निकालें** की जरूरत पड़ी है लेकिन आप सुनिश्चित नहीं थे कि कौन सी लाइब्रेरी आपको गति और सटीकता दोनों देगी? 
आप अकेले नहीं हैं। कई वास्तविक‑दुनिया प्रोजेक्ट्स में—जैसे इनवॉइस प्रोसेसिंग, रसीद डिजिटाइज़ेशन, या अनुबंध स्कैनिंग—एक तस्वीर से साफ़, खोजने योग्य टेक्स्ट प्राप्त करना पहली बाधा है। + +अच्छी खबर यह है कि Aspose OCR को एक हल्के Aspose AI मॉडल के साथ जोड़ने से यह काम कुछ ही पायथन लाइनों में किया जा सकता है। इस ट्यूटोरियल में हम **छवि को OCR कैसे करें**, चित्र को सही तरीके से लोड करना, बिल्ट‑इन स्पेल‑चेक पोस्ट‑प्रोसेसर चलाना, और अंत में **GPU संसाधनों को रिलीज़ करें** ताकि आपका ऐप मेमोरी‑फ्रेंडली बना रहे। + +इस गाइड के अंत तक आप **इनवॉइस से टेक्स्ट पहचानने** वाली छवियों को पहचान सकेंगे, सामान्य OCR त्रुटियों को स्वचालित रूप से सुधारेंगे, और अगले बैच के लिए अपना GPU साफ़ रखेंगे। + +## आपको क्या चाहिए + +- Python 3.9 या नया (कोड टाइप हिंट्स का उपयोग करता है लेकिन पहले के 3.x संस्करणों पर भी काम करता है) +- `aspose-ocr` और `aspose-ai` पैकेज (इंस्टॉल करने के लिए `pip install aspose-ocr aspose-ai`) +- CUDA‑सक्षम GPU वैकल्पिक है; यदि नहीं मिला तो स्क्रिप्ट CPU पर फ़ॉल बैक हो जाएगी। +- एक उदाहरण छवि, जैसे `sample_invoice.png`, को किसी फ़ोल्डर में रखें जिसे आप संदर्भित कर सकें। + +कोई भारी ML फ्रेमवर्क नहीं, कोई बड़े मॉडल डाउनलोड नहीं—सिर्फ एक छोटा Q4‑K‑M क्वांटाइज़्ड मॉडल जो अधिकांश GPUs पर आराम से फिट हो जाता है। + +## चरण 1: OCR इंजन को इनिशियलाइज़ करें – छवि से टेक्स्ट निकालें + +सबसे पहले आप एक `OcrEngine` इंस्टेंस बनाते हैं और उसे बताते हैं कि कौन सी भाषा अपेक्षित है। यहाँ हम अंग्रेज़ी चुनते हैं और प्लेन‑टेक्स्ट आउटपुट का अनुरोध करते हैं, जो डाउनस्ट्रीम प्रोसेसिंग के लिए आदर्श है। + +```python +import aocr # Aspose OCR package +import aspose.ai as ai # Aspose AI package + +# Initialise the OCR engine +ocr_engine = aocr.OcrEngine() +ocr_engine.language = aocr.Language.English # Choose any supported language +ocr_engine.recognize_mode = aocr.RecognitionMode.Plain # Plain text makes post‑processing easier +``` + +**यह क्यों महत्वपूर्ण है:** भाषा सेट करने से कैरेक्टर सेट सीमित हो जाता है, जिससे सटीकता बढ़ती है। प्लेन‑टेक्स्ट मोड लेआउट जानकारी को हटाता है जो आमतौर पर आपको
तब नहीं चाहिए जब आप सिर्फ छवि से टेक्स्ट निकालना चाहते हैं। + +## चरण 2: OCR के लिए छवि लोड करें – छवि को OCR कैसे करें + +अब हम इंजन को एक वास्तविक चित्र देते हैं। `Image.load` हेल्पर सामान्य फ़ॉर्मैट (PNG, JPEG, TIFF) को समझता है और फ़ाइल‑IO की जटिलताओं को एब्स्ट्रैक्ट करता है। + +```python +# Load the input image – this is the "load image for OCR" step +input_image = aocr.Image.load("YOUR_DIRECTORY/sample_invoice.png") +raw_text = ocr_engine.recognize(input_image) # Returns the recognised text as a string +``` + +**टिप:** यदि आपके स्रोत चित्र बड़े हैं, तो उन्हें इंजन को भेजने से पहले रीसाइज़ करने पर विचार करें; छोटे आकार GPU मेमोरी उपयोग को कम कर सकते हैं बिना पहचान की गुणवत्ता को नुकसान पहुँचाए। + +## चरण 3: Aspose AI मॉडल को कॉन्फ़िगर करें – इनवॉइस से टेक्स्ट पहचानें + +Aspose AI एक छोटे GGUF मॉडल के साथ आता है जिसे आप ऑटो‑डाउनलोड कर सकते हैं। उदाहरण में `Qwen2.5‑3B‑Instruct‑GGUF` रिपॉजिटरी का उपयोग किया गया है, जो `q4_k_m` में क्वांटाइज़्ड है। हम रनटाइम को भी बताते हैं कि GPU पर 20 लेयर आवंटित करे, जो गति और VRAM उपयोग के बीच संतुलन बनाता है। + +```python +# Model configuration – auto‑download a small Q4‑K‑M quantised model +model_config = ai.AsposeAIModelConfig() +model_config.allow_auto_download = "true" +model_config.hugging_face_repo_id = "bartowski/Qwen2.5-3B-Instruct-GGUF" +model_config.hugging_face_quantization = "q4_k_m" +model_config.gpu_layers = 20 # Use 20 GPU layers when a GPU is available +``` + +**पर्दे के पीछे:** क्वांटाइज़्ड मॉडल डिस्क पर लगभग 1.5 GB का है, जो फुल‑प्रिसिजन मॉडल का एक छोटा हिस्सा है, फिर भी यह सामान्य OCR गलतियों को पहचानने के लिए पर्याप्त भाषाई बारीकियों को पकड़ता है। + +## चरण 4: AsposeAI को इनिशियलाइज़ करें और स्पेल‑चेक पोस्ट‑प्रोसेसर संलग्न करें + +Aspose AI में एक तैयार‑निर्मित स्पेल‑चेक पोस्ट‑प्रोसेसर शामिल है। इसे संलग्न करने से, हर OCR परिणाम स्वचालित रूप से साफ़ हो जाएगा। + +```python +# Initialise AsposeAI and attach the built‑in spell‑check post‑processor +ocr_ai = ai.AsposeAI(model_config) # Pass the config we just
built +ocr_ai.set_post_processor(ocr_ai.postprocessor_spell_check, {}) # Empty dict → default settings +``` + +**पोस्ट‑प्रोसेसर क्यों उपयोग करें?** OCR इंजन अक्सर “Invoice” को “Invo1ce” या “Total” को “T0tal” पढ़ लेते हैं। स्पेल‑चेक एक हल्के भाषा मॉडल को कच्ची स्ट्रिंग पर चलाता है और उन त्रुटियों को ठीक करता है बिना आपको कस्टम डिक्शनरी लिखे। + +## चरण 5: OCR परिणाम पर स्पेल‑चेक पोस्ट‑प्रोसेसर चलाएँ + +सब कुछ कनेक्ट हो जाने पर, एक ही कॉल से सुधरा हुआ टेक्स्ट मिलता है। हम मूल और साफ़ किए गए दोनों संस्करण प्रिंट करते हैं ताकि आप सुधार देख सकें। + +```python +# Run the spell‑check post‑processor on the OCR result +corrected_text = ocr_ai.run_postprocessor(raw_text) + +print("Original :", raw_text) +print("Corrected:", corrected_text) +``` + +इनवॉइस के लिए सामान्य आउटपुट कुछ इस प्रकार दिख सकता है: + +``` +Original : Invo1ce #12345 +Date: 2023/07/15 +Total: $1,250.00 +... +Corrected: Invoice #12345 +Date: 2023/07/15 +Total: $1,250.00 +... +``` + +ध्यान दें कि “Invo1ce” सही शब्द “Invoice” में बदल गया। यही बिल्ट‑इन AI स्पेल‑चेक की शक्ति है। + +## चरण 6: GPU संसाधनों को रिलीज़ करें – GPU संसाधनों को सुरक्षित रूप से रिलीज़ करें + +यदि आप इसे एक दीर्घकालिक सेवा में चला रहे हैं (जैसे, एक वेब API जो प्रति मिनट दर्जनों इनवॉइस प्रोसेस करता है), तो प्रत्येक बैच के बाद GPU कॉन्टेक्स्ट को फ्री करना आवश्यक है। अन्यथा आपको मेमोरी लीक दिखेगा और अंततः “CUDA out of memory” त्रुटियां मिलेंगी। + +```python +# Release GPU resources – crucial to avoid memory leaks +ocr_ai.free_resources() +``` + +**प्रो टिप:** `free_resources()` को एक `finally` ब्लॉक या कंटेक्स्ट मैनेजर के अंदर कॉल करें ताकि यह हमेशा चले, चाहे कोई अपवाद हो या न हो। + +## पूर्ण कार्यशील उदाहरण + +सभी हिस्सों को मिलाकर आपको एक स्व-निहित स्क्रिप्ट मिलती है जिसे आप किसी भी प्रोजेक्ट में डाल सकते हैं। + +```python +# extract_text_from_image.py +import aocr +import aspose.ai as ai + +def main(): + # Step 1: Initialise OCR engine + ocr_engine = aocr.OcrEngine() + ocr_engine.language = aocr.Language.English + 
ocr_engine.recognize_mode = aocr.RecognitionMode.Plain + + # Step 2: Load image for OCR + input_image = aocr.Image.load("YOUR_DIRECTORY/sample_invoice.png") + raw_text = ocr_engine.recognize(input_image) + + # Step 3: Configure Aspose AI model + model_cfg = ai.AsposeAIModelConfig() + model_cfg.allow_auto_download = "true" + model_cfg.hugging_face_repo_id = "bartowski/Qwen2.5-3B-Instruct-GGUF" + model_cfg.hugging_face_quantization = "q4_k_m" + model_cfg.gpu_layers = 20 + + # Step 4: Initialise AI and attach spell‑check + ocr_ai = ai.AsposeAI(model_cfg) + ocr_ai.set_post_processor(ocr_ai.postprocessor_spell_check, {}) + + # Step 5: Run spell‑check + corrected_text = ocr_ai.run_postprocessor(raw_text) + + print("Original :", raw_text) + print("Corrected:", corrected_text) + + # Step 6: Release GPU resources + ocr_ai.free_resources() + +if __name__ == "__main__": + main() +``` + +फ़ाइल को सेव करें, अपनी छवि का पाथ समायोजित करें, और `python extract_text_from_image.py` चलाएँ। आपको कंसोल में साफ़ किया गया इनवॉइस टेक्स्ट प्रिंट होता दिखेगा। + +## अक्सर पूछे जाने वाले प्रश्न (FAQ) + +**प्रश्न:** क्या यह केवल CPU वाले मशीनों पर काम करता है? +**उत्तर:** बिल्कुल। यदि कोई GPU नहीं मिलता, तो Aspose AI CPU निष्पादन पर फ़ॉल बैक हो जाता है, हालांकि यह धीमा होगा। आप `model_cfg.gpu_layers = 0` सेट करके CPU को मजबूर कर सकते हैं। + +**प्रश्न:** यदि मेरे इनवॉइस अंग्रेज़ी के अलावा किसी अन्य भाषा में हैं तो? +**उत्तर:** `ocr_engine.language` को उपयुक्त enum वैल्यू (जैसे, `aocr.Language.Spanish`) में बदलें। स्पेल‑चेक मॉडल बहुभाषी है, लेकिन भाषा‑विशिष्ट मॉडल के साथ बेहतर परिणाम मिल सकते हैं। + +**प्रश्न:** क्या मैं कई छवियों को लूप में प्रोसेस कर सकता हूँ? +**उत्तर:** हाँ। लोडिंग, पहचान, और पोस्ट‑प्रोसेसिंग स्टेप्स को `for` लूप के अंदर रखें। यदि आप वही AI इंस्टेंस पुनः उपयोग कर रहे हैं तो लूप के बाद या प्रत्येक बैच के बाद `ocr_ai.free_resources()` कॉल करना याद रखें। + +**प्रश्न:** मॉडल डाउनलोड का आकार कितना है? 
+**उत्तर:** क्वांटाइज़्ड `q4_k_m` संस्करण के लिए लगभग 1.5 GB। यह पहली बार चलाने के बाद कैश हो जाता है, इसलिए बाद के निष्पादन तुरंत होते हैं। + +## निष्कर्ष + +इस ट्यूटोरियल में हमने दिखाया कि कैसे Aspose OCR का उपयोग करके **छवि से टेक्स्ट निकालें**, एक छोटा AI मॉडल कॉन्फ़िगर करें, स्पेल‑चेक पोस्ट‑प्रोसेसर लागू करें, और सुरक्षित रूप से **GPU संसाधनों को रिलीज़ करें**। यह वर्कफ़्लो चित्र लोड करने से लेकर स्वयं को साफ़ करने तक सब कुछ कवर करता है, जिससे आपको **इनवॉइस से टेक्स्ट पहचानने** के परिदृश्यों के लिए एक विश्वसनीय पाइपलाइन मिलती है। + +अगला कदम? स्पेल‑चेक को एक कस्टम एंटिटी‑एक्सट्रैक्शन मॉडल से बदलने का प्रयास करें + +{{< /blocks/products/pf/tutorial-page-section >}} +{{< /blocks/products/pf/main-container >}} +{{< /blocks/products/pf/main-wrap-class >}} +{{< blocks/products/products-backtop-button >}} \ No newline at end of file diff --git a/ocr/hindi/python/general/how-to-batch-ocr-with-aspose-ocr-full-python-guide/_index.md b/ocr/hindi/python/general/how-to-batch-ocr-with-aspose-ocr-full-python-guide/_index.md new file mode 100644 index 000000000..c43a3f27a --- /dev/null +++ b/ocr/hindi/python/general/how-to-batch-ocr-with-aspose-ocr-full-python-guide/_index.md @@ -0,0 +1,215 @@ +--- +category: general +date: 2026-05-03 +description: Aspose OCR और AI स्पेल‑चेक का उपयोग करके इमेजेज़ को बैच में OCR करने + का तरीका। इमेजेज़ से टेक्स्ट निकालना, स्पेल चेक लागू करना, मुफ्त AI संसाधनों का + उपयोग करना और OCR त्रुटियों को सुधारना सीखें। +draft: false +keywords: +- how to batch ocr +- extract text from images +- free ai resources +- apply spell check +- correct ocr errors +language: hi +og_description: Aspose OCR और AI स्पेल‑चेक का उपयोग करके छवियों को बैच में OCR करने + का तरीका। छवियों से टेक्स्ट निकालने, स्पेल‑चेक लागू करने, मुफ्त AI संसाधनों का उपयोग + करने और OCR त्रुटियों को सुधारने के लिए चरण‑दर‑चरण गाइड का पालन करें। +og_title: Aspose OCR के साथ बैच OCR कैसे करें – पूर्ण पायथन ट्यूटोरियल +tags: +- OCR +- Python +- AI +- Aspose +title: Aspose OCR के साथ बैच OCR 
कैसे करें – पूर्ण Python गाइड +url: /hi/python/general/how-to-batch-ocr-with-aspose-ocr-full-python-guide/ +--- + +{{< blocks/products/pf/main-wrap-class >}} +{{< blocks/products/pf/main-container >}} +{{< blocks/products/pf/tutorial-page-section >}} + +# Aspose OCR के साथ बैच OCR कैसे करें – पूर्ण Python गाइड + +क्या आप कभी सोचे हैं **how to batch OCR** को पूरी स्कैन की हुई PDFs या फ़ोटो की फ़ोल्डर पर बिना प्रत्येक फ़ाइल के लिए अलग स्क्रिप्ट लिखे लागू करना? आप अकेले नहीं हैं। कई वास्तविक‑दुनिया पाइपलाइन में आपको **extract text from images** करने की जरूरत पड़ेगी, वर्तनी की गलतियों को सुधारना होगा, और अंत में आपने जो AI संसाधन आवंटित किए हैं उन्हें मुक्त करना होगा। यह ट्यूटोरियल आपको बिल्कुल वही दिखाता है कि Aspose OCR, एक हल्का AI पोस्ट‑प्रोसेसर, और कुछ Python लाइनों के साथ यह कैसे किया जाए। + +हम OCR इंजन को इनिशियलाइज़ करने, AI स्पेल‑चेकर को जोड़ने, चित्रों की डायरेक्टरी पर लूप करने, और बाद में मॉडल को साफ़ करने की प्रक्रिया बताएँगे। अंत तक आपके पास एक तैयार‑चलाने‑योग्य स्क्रिप्ट होगी जो **corrects OCR errors** को स्वचालित रूप से ठीक करती है और **free AI resources** को रिलीज़ करती है ताकि आपका GPU खुश रहे। + +## आपको क्या चाहिए + +- Python 3.9+ (कोड type‑hints का उपयोग करता है लेकिन पहले के 3.x संस्करणों पर भी काम करता है) +- `asposeocr` पैकेज (`pip install asposeocr`) – यह OCR इंजन प्रदान करता है। +- Hugging Face मॉडल `bartowski/Qwen2.5-3B-Instruct-GGUF` तक पहुँच (स्वचालित रूप से डाउनलोड होता है)। +- एक GPU जिसमें कम से कम कुछ GB VRAM हो (स्क्रिप्ट `gpu_layers = 30` सेट करती है, आवश्यकता अनुसार इसे कम किया जा सकता है)। + +कोई बाहरी सेवाएँ नहीं, कोई पेड API नहीं – सब कुछ स्थानीय रूप से चलता है। + +--- + +## चरण 1: OCR इंजन सेट अप करें – **How to Batch OCR** को कुशलतापूर्वक + +हजारों छवियों को प्रोसेस करने से पहले हमें एक मजबूत OCR इंजन चाहिए। Aspose OCR हमें एक ही कॉल में भाषा और पहचान मोड चुनने की सुविधा देता है। + +```python +# Step 1: Initialize the OCR engine for English plain‑text output +def init_ocr() -> aocr.OcrEngine: + ocr_engine = aocr.OcrEngine() + 
ocr_engine.language = aocr.Language.English # English language pack + ocr_engine.recognize_mode = aocr.RecognitionMode.Plain # Returns raw string, no layout + return ocr_engine +``` + +**Why this matters:** `recognize_mode` को `Plain` सेट करने से आउटपुट हल्का रहता है, जो बाद में स्पेल‑चेक चलाने की योजना होने पर आदर्श है। यदि आपको लेआउट जानकारी चाहिए तो आप `Layout` पर स्विच करेंगे, लेकिन इससे ओवरहेड बढ़ता है जिसे आप बैच जॉब में शायद नहीं चाहते। + +> **Pro tip:** यदि आप बहुभाषी स्कैन से निपट रहे हैं, तो आप एक सूची पास कर सकते हैं जैसे `ocr_engine.language = [aocr.Language.English, aocr.Language.Spanish]`। + +--- + +## चरण 2: AI पोस्ट‑प्रोसेसर को इनिशियलाइज़ करें – OCR आउटपुट पर **Apply Spell Check** लागू करें + +Aspose AI एक बिल्ट‑इन पोस्ट‑प्रोसेसर के साथ आता है जो आपकी पसंद का कोई भी मॉडल चला सकता है। यहाँ हम Hugging Face से एक क्वांटाइज़्ड Qwen 2.5 मॉडल लाते हैं और स्पेल‑चेक रूटीन को जोड़ते हैं। + +```python +# Step 2: Configure and start the AI post‑processor +def init_ai() -> aocr.ai.AsposeAI: + model_cfg = AsposeAIModelConfig() + model_cfg.allow_auto_download = "true" + model_cfg.hugging_face_repo_id = "bartowski/Qwen2.5-3B-Instruct-GGUF" + model_cfg.hugging_face_quantization = "q4_k_m" + model_cfg.gpu_layers = 30 # Adjust based on your GPU memory + ai_processor = AsposeAI() + ai_processor.initialize(model_cfg) + + # Attach the built‑in spell‑check post‑processor + ai_processor.set_post_processor(ai_processor.postprocessor_spell_check, {}) + return ai_processor +``` + +**Why this matters:** मॉडल क्वांटाइज़्ड है (`q4_k_m`), जो मेमोरी उपयोग को काफी कम करता है जबकि फिर भी उचित भाषा समझ प्रदान करता है। `set_post_processor` को कॉल करके हम Aspose AI को बताते हैं कि वह किसी भी स्ट्रिंग पर **apply spell check** चरण को स्वचालित रूप से चलाए। + +> **Watch out:** यदि आपका GPU 30 लेयर्स संभाल नहीं सकता, तो संख्या को 15 या यहाँ तक कि 5 कर दें – स्क्रिप्ट अभी भी काम करेगी, बस थोड़ा धीमी होगी। + +--- + +## चरण 3: एक सिंगल इमेज पर OCR चलाएँ और **Correct OCR Errors** करें + +अब 
जब OCR इंजन और AI स्पेल‑चेकर दोनों तैयार हैं, हम उन्हें मिलाते हैं। यह फ़ंक्शन एक इमेज लोड करता है, कच्चा टेक्स्ट निकालता है, फिर AI पोस्ट‑प्रोसेसर को चलाकर उसे साफ़ करता है। + +```python +# Step 3: OCR an image and run the spell‑check post‑processor +def ocr_and_correct(image_path: str, + ocr_engine: aocr.OcrEngine, + ai_processor: aocr.ai.AsposeAI) -> str: + image = aocr.Image.load(image_path) # Load any supported format + raw_text = ocr_engine.recognize(image) # Plain string from OCR + corrected_text = ai_processor.run_postprocessor(raw_text) + return corrected_text +``` + +**Why this matters:** कच्ची OCR स्ट्रिंग को सीधे AI मॉडल में फ़ीड करने से हमें **correct OCR errors** पास मिलता है बिना किसी रेगेक्स या कस्टम डिक्शनरी लिखे। मॉडल संदर्भ समझता है, इसलिए वह “recieve” → “receive” जैसी और भी सूक्ष्म गलतियों को ठीक कर सकता है। + +--- + +## चरण 4: बड़े पैमाने पर **Extract Text from Images** – असली बैच लूप + +यहीं पर **how to batch OCR** का जादू दिखता है। हम एक डायरेक्टरी पर इटररेट करते हैं, असमर्थित फ़ाइलों को छोड़ते हैं, और प्रत्येक सुधारा हुआ आउटपुट एक `.txt` फ़ाइल में लिखते हैं। + +```python +# Step 4: Process an entire folder of images +if __name__ == "__main__": + # Initialize once – reuse for every file + ocr_engine = init_ocr() + ai_processor = init_ai() + + input_dir = "YOUR_DIRECTORY/input_images" + output_dir = "YOUR_DIRECTORY/output_text" + os.makedirs(output_dir, exist_ok=True) + + for file_name in os.listdir(input_dir): + # Only handle common image extensions + if not file_name.lower().endswith(('.png', '.jpg', '.jpeg', '.tif', '.tiff')): + continue + + image_path = os.path.join(input_dir, file_name) + corrected = ocr_and_correct(image_path, ocr_engine, ai_processor) + + txt_path = os.path.join(output_dir, + os.path.splitext(file_name)[0] + ".txt") + with open(txt_path, "w", encoding="utf-8") as txt_file: + txt_file.write(corrected) + + print(f"Processed {file_name}") + + # Step 5: Release **free AI resources** after the batch finishes + 
ai_processor.free_resources() +``` + +### अपेक्षित आउटपुट + +एक इमेज जिसमें वाक्य *“The quick brown fox jumps over the lazzy dog.”* है, आप एक टेक्स्ट फ़ाइल इस प्रकार देखेंगे: + +``` +The quick brown fox jumps over the lazy dog. +``` + +ध्यान दें कि दोहरी “z” स्वचालित रूप से ठीक हो गई – यही AI स्पेल‑चेक का काम है। + +**Why this matters:** OCR और AI ऑब्जेक्ट्स को **एक बार** बनाकर और पुन: उपयोग करके, हम प्रत्येक फ़ाइल के लिए मॉडल लोड करने के ओवरहेड से बचते हैं। यह स्केल पर **how to batch OCR** करने का सबसे कुशल तरीका है। + +--- + +## चरण 5: क्लीन अप – **Free AI Resources** को सही तरीके से मुक्त करें + +जब आप समाप्त कर लें, `free_resources()` को कॉल करने से GPU मेमोरी, CUDA कॉन्टेक्स्ट, और मॉडल द्वारा बनाए गए किसी भी टेम्पररी फ़ाइलों को मुक्त किया जाता है। + +```python +# Step 5: Explicitly free GPU and model memory +ai_processor.free_resources() +``` + +इस चरण को छोड़ने से लटके हुए GPU अलोकेशन रह सकते हैं, जो बाद के Python प्रोसेस को क्रैश कर सकते हैं या VRAM को खा सकते हैं। इसे बैच जॉब के “लाइट्स बंद करने” भाग के रूप में सोचें। + +--- + +## सामान्य समस्याएँ और अतिरिक्त टिप्स + +| Issue | What to Look For | Fix | +|-------|------------------|-----| +| **Out‑of‑memory errors** | GPU कुछ दर्जन इमेजेज़ के बाद समाप्त हो जाता है | `gpu_layers` को कम करें या CPU पर स्विच करें (`model_cfg.gpu_layers = 0`). 
| +| **Missing language pack** | OCR खाली स्ट्रिंग्स लौटाता है | `asposeocr` संस्करण में अंग्रेज़ी भाषा डेटा शामिल है, यह सुनिश्चित करें; आवश्यकता होने पर पुनः इंस्टॉल करें। | +| **Non‑image files** | स्क्रिप्ट एक बिखरी हुई `.pdf` पर क्रैश हो जाती है | `if not file_name.lower().endswith(...)` गार्ड पहले से ही उन्हें स्किप कर देता है। | +| **Spell‑check not applied** | आउटपुट कच्चे OCR जैसा ही दिखता है | लूप से पहले `ai_processor.set_post_processor` कॉल किया गया है, यह सुनिश्चित करें। | +| **Slow batch speed** | प्रति इमेज 5 सेकंड से अधिक लेता है | पहले रन के बाद `model_cfg.allow_auto_download = "false"` सक्षम करें, ताकि मॉडल हर बार पुनः डाउनलोड न हो। | + +**Pro tip:** यदि आपको अंग्रेज़ी के अलावा किसी अन्य भाषा में **extract text from images** करने की जरूरत है, तो बस `ocr_engine.language` को उपयुक्त enum में बदल दें (जैसे, `aocr.Language.French`)। वही AI पोस्ट‑प्रोसेसर अभी भी स्पेल‑चेक लागू करेगा, लेकिन सर्वोत्तम परिणामों के लिए आप भाषा‑विशिष्ट मॉडल चाहते हैं। + +--- + +## सारांश और अगले कदम + +हमने **how to batch OCR** के लिए पूरी पाइपलाइन को कवर किया है: + +1. **Initialize** एक plain‑text OCR इंजन अंग्रेज़ी के लिए। +2. **Configure** एक AI स्पेल‑चेक मॉडल और इसे पोस्ट‑प्रोसेसर के रूप में बाइंड करें। +3. **Run** प्रत्येक इमेज पर OCR और AI को **correct OCR errors** स्वचालित रूप से करने दें। +4. **Loop** एक डायरेक्टरी पर बड़े पैमाने पर **extract text from images** करने के लिए। +5.
**Free AI resources** जब जॉब समाप्त हो जाए। + +यहाँ से आप कर सकते हैं: + +- सुधारे हुए टेक्स्ट को डाउनस्ट्रीम NLP पाइपलाइन (सेंटिमेंट एनालिसिस, एंटिटी एक्सट्रैक्शन, आदि) में पाइप करें। +- स्पेल‑चेक पोस्ट‑प्रोसेसर को `ai_processor.set_post_processor(your_custom_func, {})` कॉल करके एक कस्टम समरीज़र से बदलें। +- यदि आपका GPU कई स्ट्रीम संभाल सकता है, तो `concurrent.futures.ThreadPoolExecutor` के साथ फ़ोल्डर लूप को पैरललाइज़ करें। + +--- + +## अंतिम विचार + +OCR को बैच करना कोई कठिन काम नहीं होना चाहिए। Aspose OCR को एक हल्के AI मॉडल के साथ उपयोग करके, आपको एक **one‑stop solution** मिलता है जो **extracts text from images**, **applies spell check**, **corrects OCR errors**, और **frees AI resources** को साफ़-सुथरे ढंग से करता है। स्क्रिप्ट को एक टेस्ट फ़ोल्डर पर चलाएँ, अपने हार्डवेयर के अनुसार GPU लेयर काउंट को समायोजित करें, और आपके पास मिनटों में एक प्रोडक्शन‑रेडी पाइपलाइन होगी। + +क्या मॉडल को ट्यून करने, PDFs को हैंडल करने, या इसे वेब सर्विस में इंटीग्रेट करने के बारे में प्रश्न हैं? नीचे कमेंट छोड़ें या GitHub पर मुझे पिंग करें। कोडिंग का आनंद लें, और आपका OCR हमेशा सटीक रहे! 
+ +{{< /blocks/products/pf/tutorial-page-section >}} +{{< /blocks/products/pf/main-container >}} +{{< /blocks/products/pf/main-wrap-class >}} +{{< blocks/products/products-backtop-button >}} \ No newline at end of file diff --git a/ocr/hindi/python/general/python-ocr-tutorial-batch-ocr-processing-made-easy/_index.md b/ocr/hindi/python/general/python-ocr-tutorial-batch-ocr-processing-made-easy/_index.md new file mode 100644 index 000000000..6d6f1f659 --- /dev/null +++ b/ocr/hindi/python/general/python-ocr-tutorial-batch-ocr-processing-made-easy/_index.md @@ -0,0 +1,300 @@ +--- +category: general +date: 2026-05-03 +description: Python OCR ट्यूटोरियल जो दिखाता है कि PNG इमेज फ़ाइलें कैसे लोड करें, + छवि से टेक्स्ट पहचानें और बैच OCR प्रोसेसिंग के लिए मुफ्त AI संसाधन प्रदान करता + है। +draft: false +keywords: +- python ocr tutorial +- batch ocr processing +- free ai resources +- load png image +- recognize text from image +language: hi +og_description: Python OCR ट्यूटोरियल आपको PNG छवियों को लोड करने, छवि से टेक्स्ट + पहचानने और बैच OCR प्रोसेसिंग के लिए मुफ्त AI संसाधनों को संभालने के चरणों से परिचित + कराता है। +og_title: Python OCR ट्यूटोरियल – मुफ्त AI संसाधनों के साथ तेज़ बैच OCR +tags: +- OCR +- Python +- AI +title: पायथन OCR ट्यूटोरियल – बैच OCR प्रोसेसिंग को आसान बनाना +url: /hi/python/general/python-ocr-tutorial-batch-ocr-processing-made-easy/ +--- + +{{< blocks/products/pf/main-wrap-class >}} +{{< blocks/products/pf/main-container >}} +{{< blocks/products/pf/tutorial-page-section >}} + +# Python OCR ट्यूटोरियल – बैच OCR प्रोसेसिंग आसान बनाएं + +क्या आपको कभी ऐसा **python ocr tutorial** चाहिए था जो वास्तव में आपको सैकड़ों PNG फ़ाइलों पर OCR चलाने दे बिना सिर दर्द के? 
आप अकेले नहीं हैं। कई वास्तविक‑दुनिया प्रोजेक्ट्स में आपको **load png image** फ़ाइलें लोड करनी पड़ती हैं, उन्हें इंजन में फीड करना होता है, और काम खत्म होने पर AI संसाधनों को साफ़ करना होता है। + +इस गाइड में हम एक पूर्ण, तुरंत चलने योग्य उदाहरण के माध्यम से दिखाएंगे कि कैसे **recognize text from image** फ़ाइलों को बैच में प्रोसेस किया जाए और अंतर्निहित AI मेमोरी को मुक्त किया जाए। अंत तक आपके पास एक स्व-समाहित स्क्रिप्ट होगी जिसे आप किसी भी प्रोजेक्ट में डाल सकते हैं—कोई अतिरिक्त झंझट नहीं, सिर्फ़ आवश्यक बातें। + +## What You’ll Need + +- Python 3.10 या नया (यहाँ उपयोग किया गया सिंटैक्स f‑strings और type hints पर निर्भर करता है) +- एक OCR लाइब्रेरी जो `engine.recognize` मेथड प्रदान करती हो – डेमो के लिए हम एक काल्पनिक `aocr` पैकेज मानेंगे, लेकिन आप इसे Tesseract, EasyOCR आदि से बदल सकते हैं +- कोड स्निपेट में दिखाया गया `ai` हेल्पर मॉड्यूल (यह मॉडल इनिशियलाइज़ेशन और रिसोर्स क्लीनअप संभालता है) +- वह फ़ोल्डर जिसमें आप प्रोसेस करना चाहते हैं कई PNG फ़ाइलें हों + +यदि आपके पास `aocr` या `ai` स्थापित नहीं है, तो आप उन्हें स्टब्स के साथ मॉक कर सकते हैं – अंत में “Optional Stubs” सेक्शन देखें। + +## Step 1: Initialize the AI Engine (Free AI Resources) + +कोई भी इमेज OCR पाइपलाइन में फीड करने से पहले, अंतर्निहित मॉडल तैयार होना चाहिए। केवल एक बार इनिशियलाइज़ करने से मेमोरी बचती है और बैच जॉब्स तेज़ होते हैं। + +```python +# step_1_initialize.py +import ai # hypothetical helper that wraps the AI model +import aocr # OCR library + +def init_engine(config_path: str = "config.yaml"): + """ + Initialize the AI engine if it hasn't been set up yet. + This uses free AI resources – the engine will be released later. 
+ """ + if not ai.is_initialized(): + ai.initialize(config_path) # auto‑initialize with the provided configuration + else: + print("Engine already initialized.") +``` + +**क्यों यह महत्वपूर्ण है:** +`ai.initialize` को प्रत्येक इमेज के लिए बार‑बार कॉल करने से GPU मेमोरी बार‑बार अलोकेट होगी, अंत में स्क्रिप्ट क्रैश हो जाएगी। `ai.is_initialized()` की जाँच करके हम एक ही अलोकेशन सुनिश्चित करते हैं – यही “free AI resources” सिद्धांत है। + +## Step 2: Load PNG Image Files for Batch OCR Processing + +अब हम सभी PNG फ़ाइलें इकट्ठा करते हैं जिन्हें OCR के माध्यम से चलाना है। `pathlib` का उपयोग कोड को OS‑अग्नॉस्टिक रखता है। + +```python +# step_2_load_images.py +from pathlib import Path +from typing import List + +def collect_png_paths(directory: str) -> List[Path]: + """ + Scan `directory` and return a list of Path objects pointing to PNG files. + """ + base_path = Path(directory) + if not base_path.is_dir(): + raise NotADirectoryError(f"'{directory}' is not a valid folder.") + + png_files = sorted(base_path.glob("*.png")) + if not png_files: + raise FileNotFoundError("No PNG images found in the specified directory.") + + print(f"Found {len(png_files)} PNG image(s) to process.") + return png_files +``` + +**एज केस:** +यदि फ़ोल्डर में गैर‑PNG फ़ाइलें (जैसे JPEG) हों तो उन्हें अनदेखा किया जाएगा, जिससे `engine.recognize` किसी असमर्थित फ़ॉर्मेट पर अटक नहीं पाएगा। + +## Step 3: Run OCR on Each Image and Apply Post‑Processing + +इंजन तैयार है और फ़ाइल सूची तैयार है, अब हम इमेजेज़ पर लूप कर सकते हैं, कच्चा टेक्स्ट निकाल सकते हैं, और उसे पोस्ट‑प्रोसेसर को दे सकते हैं जो सामान्य OCR आर्टिफैक्ट्स (जैसे अनावश्यक लाइन ब्रेक) को साफ़ करता है। + +```python +# step_3_ocr_batch.py +import aocr +import ai +from pathlib import Path +from typing import List + +def ocr_batch(image_paths: List[Path]) -> List[str]: + """ + Perform OCR on each PNG image and return a list of cleaned strings. 
+ """ + results = [] + for image_path in image_paths: + # Load the image – aocr.Image.load abstracts away Pillow/OpenCV details + img = aocr.Image.load(str(image_path)) + + # Recognize raw text + raw_text = engine.recognize(img) + + # Refine the raw OCR output using the AI post‑processor + cleaned_text = ai.run_postprocessor(raw_text) + results.append(cleaned_text) + + print(f"Processed {image_path.name}: {len(cleaned_text)} characters extracted.") + + return results +``` + +**हम लोडिंग को रिकग्निशन से अलग क्यों करते हैं:** +`aocr.Image.load` लेज़ी डिकोडिंग कर सकता है, जो बड़े बैच के लिए तेज़ है। लोड स्टेप को स्पष्ट रखने से बाद में JPEG या TIFF फ़ाइलों को संभालने के लिए किसी अलग इमेज लाइब्रेरी में स्विच करना आसान हो जाता है। + +## Step 4: Clean Up – Free AI Resources After the Batch + +बैच पूरा होने के बाद, हमें मॉडल को रिलीज़ करना चाहिए ताकि मेमोरी लीक्स न हों, विशेषकर GPU‑सक्षम मशीनों पर। + +```python +# step_4_cleanup.py +import ai + +def release_resources(): + """ + Free any allocated AI resources. Safe to call multiple times. + """ + if ai.is_initialized(): + ai.free_resources() + print("AI resources have been released.") + else: + print("No AI resources were allocated.") +``` + +## Putting It All Together – The Complete Script + +नीचे एक सिंगल फ़ाइल है जो चारों स्टेप्स को एक सुसंगत वर्कफ़्लो में जोड़ती है। इसे `batch_ocr.py` के रूप में सेव करें और कमांड लाइन से चलाएँ। + +```python +# batch_ocr.py +""" +Python OCR tutorial – end‑to‑end batch OCR processing. +Loads PNG images, runs OCR, post‑processes results, and frees AI resources. 
+""" + +import sys +from pathlib import Path +import ai +import aocr + +# ---------------------------------------------------------------------- +# Helper functions (copied from the steps above) +# ---------------------------------------------------------------------- +def init_engine(cfg: str = "config.yaml"): + if not ai.is_initialized(): + ai.initialize(cfg) + else: + print("Engine already initialized.") + +def collect_png_paths(directory: str): + base_path = Path(directory) + if not base_path.is_dir(): + raise NotADirectoryError(f"'{directory}' is not a valid folder.") + png_files = sorted(base_path.glob("*.png")) + if not png_files: + raise FileNotFoundError("No PNG images found in the specified directory.") + print(f"Found {len(png_files)} PNG image(s) to process.") + return png_files + +def ocr_batch(image_paths): + results = [] + for image_path in image_paths: + img = aocr.Image.load(str(image_path)) + raw_text = engine.recognize(img) + cleaned_text = ai.run_postprocessor(raw_text) + results.append(cleaned_text) + print(f"Processed {image_path.name}: {len(cleaned_text)} characters.") + return results + +def release_resources(): + if ai.is_initialized(): + ai.free_resources() + print("AI resources released.") + else: + print("No resources to release.") + +# ---------------------------------------------------------------------- +# Main execution block +# ---------------------------------------------------------------------- +if __name__ == "__main__": + if len(sys.argv) != 2: + print("Usage: python batch_ocr.py ") + sys.exit(1) + + image_dir = sys.argv[1] + + try: + init_engine() + png_paths = collect_png_paths(image_dir) + texts = ocr_batch(png_paths) + + # Optional: write results to a single text file + output_file = Path("ocr_results.txt") + with output_file.open("w", encoding="utf-8") as f: + for path, txt in zip(png_paths, texts): + f.write(f"--- {path.name} ---\n") + f.write(txt + "\n\n") + print(f"All results saved to {output_file.resolve()}") + 
finally: + release_resources() +``` + +### Expected Output + +तीन PNG फ़ाइलों वाले फ़ोल्डर के खिलाफ स्क्रिप्ट चलाने पर यह आउटपुट दे सकता है: + +``` +Engine already initialized. +Found 3 PNG image(s) to process. +Processed invoice1.png: 452 characters. +Processed receipt2.png: 317 characters. +Processed flyer3.png: 689 characters. +All results saved to /home/user/ocr_results.txt +AI resources released. +``` + +`ocr_results.txt` फ़ाइल प्रत्येक इमेज के लिए एक स्पष्ट डिलिमिटर के साथ साफ़ किया गया OCR टेक्स्ट रखेगी। + +## Optional Stubs for aocr & ai (If You Don’t Have Real Packages) + +यदि आप भारी OCR लाइब्रेरीज़ को इम्पोर्ट किए बिना फ्लो टेस्ट करना चाहते हैं, तो आप न्यूनतम मॉक मॉड्यूल बना सकते हैं: + +```python +# aocr/__init__.py +class Image: + @staticmethod + def load(path): + return f"ImageObject({path})" + +def dummy_recognize(image): + return "Raw OCR output for " + str(image) + +engine = type("Engine", (), {"recognize": dummy_recognize})() +``` + +```python +# ai/__init__.py +_state = {"initialized": False} + +def is_initialized(): + return _state["initialized"] + +def initialize(cfg): + print(f"Initializing AI engine with {cfg}") + _state["initialized"] = True + +def run_postprocessor(text): + # Very naive cleanup: strip extra spaces + return " ".join(text.split()) + +def free_resources(): + print("Freeing AI resources") + _state["initialized"] = False +``` + +इन फ़ोल्डरों को `batch_ocr.py` के बगल में रखें और स्क्रिप्ट चलाएगी, मॉक परिणाम प्रिंट करेगी। + +## Pro Tips & Common Pitfalls + +- **Memory spikes:** यदि आप हजारों हाई‑रिज़ॉल्यूशन PNG प्रोसेस कर रहे हैं, तो OCR से पहले उनका आकार बदलने पर विचार करें। `aocr.Image.load` अक्सर `max_size` आर्ग्यूमेंट स्वीकार करता है। +- **Unicode handling:** हमेशा आउटपुट फ़ाइल को `encoding="utf-8"` के साथ खोलें; OCR इंजन गैर‑ASCII कैरेक्टर्स भी आउटपुट कर सकते हैं। +- **Parallelism:** CPU‑बाउंड OCR के लिए आप `ocr_batch` को `concurrent.futures.ThreadPoolExecutor` में रैप कर सकते हैं। बस यह याद रखें कि एक ही `ai` इंस्टेंस रखें – 
कई थ्रेड्स जो प्रत्येक `ai.initialize` कॉल करते हैं, “free AI resources” लक्ष्य को नष्ट कर देते हैं। +- **Error resilience:** प्रति‑इमेज लूप को `try/except` ब्लॉक में रखें ताकि एक ही करप्ट PNG पूरी बैच को रोक न सके। + +## Conclusion + +अब आपके पास एक **python ocr tutorial** है जो दिखाता है कैसे **load png image** फ़ाइलें लोड करें, **batch OCR processing** करें, और जिम्मेदारी से **free AI resources** मैनेज करें। पूर्ण, रन करने योग्य उदाहरण ठीक‑ठीक दिखाता है कैसे **recognize text from image** ऑब्जेक्ट्स को प्रोसेस किया जाए और बाद में क्लीन‑अप किया जाए, ताकि आप इसे अपने प्रोजेक्ट्स में कॉपी‑पेस्ट कर सकें बिना किसी हिस्से की कमी की तलाश किए। + +अगले कदम के लिए तैयार हैं? स्टब्ड `aocr` और `ai` मॉड्यूल को वास्तविक लाइब्रेरीज़ जैसे `pytesseract` और `torchvision` से बदलें। आप स्क्रिप्ट को JSON आउटपुट, डेटाबेस में परिणाम पुश करना, या क्लाउड स्टोरेज बकेट के साथ इंटीग्रेट करने के लिए भी विस्तारित कर सकते हैं। संभावनाएँ अनंत हैं—हैप्पी कोडिंग! + +{{< /blocks/products/pf/tutorial-page-section >}} +{{< /blocks/products/pf/main-container >}} +{{< /blocks/products/pf/main-wrap-class >}} +{{< blocks/products/products-backtop-button >}} \ No newline at end of file diff --git a/ocr/hindi/python/general/run-ocr-on-image-complete-guide-to-structured-text-extractio/_index.md b/ocr/hindi/python/general/run-ocr-on-image-complete-guide-to-structured-text-extractio/_index.md new file mode 100644 index 000000000..9e6b94699 --- /dev/null +++ b/ocr/hindi/python/general/run-ocr-on-image-complete-guide-to-structured-text-extractio/_index.md @@ -0,0 +1,254 @@ +--- +category: general +date: 2026-05-03 +description: छवि पर OCR चलाना और संरचित OCR मान्यता का उपयोग करके निर्देशांक के साथ + पाठ निकालना सीखें। चरण‑दर‑चरण Python कोड शामिल है। +draft: false +keywords: +- run OCR on image +- extract text with coordinates +- structured OCR recognition +- OCR post‑processing +- bounding box extraction +- image text detection +language: hi +og_description: छवि पर OCR चलाएँ और संरचित OCR मान्यता का उपयोग करके 
निर्देशांक के + साथ पाठ प्राप्त करें। व्याख्याओं के साथ पूर्ण Python उदाहरण। +og_title: छवि पर OCR चलाएँ – संरचित पाठ निष्कर्षण ट्यूटोरियल +tags: +- OCR +- Python +- Computer Vision +title: इमेज पर OCR चलाएँ – संरचित टेक्स्ट निष्कर्षण के लिए पूर्ण गाइड +url: /hi/python/general/run-ocr-on-image-complete-guide-to-structured-text-extractio/ +--- + +{{< blocks/products/pf/main-wrap-class >}} +{{< blocks/products/pf/main-container >}} +{{< blocks/products/pf/tutorial-page-section >}} + +# Run OCR on image – Structured Text Extraction का पूरा गाइड + +क्या आपको कभी **run OCR on image** फ़ाइलों को प्रोसेस करना पड़ा है लेकिन शब्दों की सटीक पोज़िशन नहीं रख पाए? आप अकेले नहीं हैं। कई प्रोजेक्ट्स—जैसे रसीद स्कैनिंग, फ़ॉर्म डिजिटाइज़ेशन, या UI टेस्टिंग—में आपको न केवल कच्चा टेक्स्ट चाहिए बल्कि बाउंडिंग बॉक्स भी चाहिए जो बताता है कि प्रत्येक लाइन इमेज में कहाँ स्थित है। + +यह ट्यूटोरियल आपको **aocr** इंजन का उपयोग करके *run OCR on image* करने, **structured OCR recognition** का अनुरोध करने, और फिर परिणाम को जियोमेट्री बनाए रखते हुए पोस्ट‑प्रोसेस करने का व्यावहारिक तरीका दिखाता है। अंत तक आप केवल कुछ ही पायथन लाइनों में **coordinates के साथ टेक्स्ट एक्सट्रैक्ट** कर पाएँगे, और समझेंगे कि structured मोड डाउनस्ट्रीम टास्क्स के लिए क्यों महत्वपूर्ण है। + +## What You’ll Learn + +- **structured OCR recognition** के लिए OCR इंजन को इनिशियलाइज़ कैसे करें। +- इमेज फीड करके रॉ रिज़ल्ट कैसे प्राप्त करें जिसमें लाइन बाउंड्स शामिल हों। +- पोस्ट‑प्रोसेसर चलाकर टेक्स्ट को साफ़ करें बिना जियोमेट्री खोए। +- अंतिम लाइनों पर इटरेट करके प्रत्येक टेक्स्ट को उसके बाउंडिंग बॉक्स के साथ प्रिंट करें। + +कोई जादू नहीं, कोई छिपे हुए स्टेप नहीं—सिर्फ एक पूर्ण, रन करने योग्य उदाहरण जिसे आप अपने प्रोजेक्ट में डाल सकते हैं। + +--- + +## Prerequisites + +शुरू करने से पहले सुनिश्चित करें कि आपके पास निम्नलिखित इंस्टॉल्ड हैं: + +```bash +pip install aocr ai # hypothetical packages; replace with real ones if needed +``` + +आपको एक इमेज फ़ाइल (`input_image.png` या `.jpg`) भी चाहिए जिसमें स्पष्ट, पढ़ने योग्य टेक्स्ट 
हो। स्कैन की गई इनवॉइस से लेकर स्क्रीनशॉट तक, जब तक OCR इंजन अक्षर देख सके, कोई दिक्कत नहीं। + +--- + +## Step 1: Initialise the OCR engine for structured recognition + +सबसे पहले हम `aocr.Engine()` का एक इंस्टेंस बनाते हैं और उसे बताते हैं कि हमें **structured OCR recognition** चाहिए। Structured मोड न केवल प्लेन टेक्स्ट बल्कि प्रत्येक लाइन के लिए ज्योमेट्रिक डेटा (बाउंडिंग रेक्टेंगल) भी रिटर्न करता है, जो इमेज पर टेक्स्ट को मैप करने के लिए आवश्यक है। + +```python +import aocr +import ai # hypothetical post‑processing module + +# Initialise the OCR engine +ocr_engine = aocr.Engine() + +# Request structured recognition (text + geometry) +ocr_engine.recognize_mode = aocr.RecognitionMode.Structured +``` + +> **Why this matters:** +> डिफ़ॉल्ट मोड में इंजन केवल शब्दों की एक स्ट्रिंग दे सकता है। Structured मोड पेज → लाइन → शब्द की हायरार्की देता है, प्रत्येक के साथ कोऑर्डिनेट्स, जिससे मूल इमेज पर परिणाम ओवरले करना या लेआउट‑अवेयर मॉडल में फीड करना बहुत आसान हो जाता है। + +--- + +## Step 2: Run OCR on the image and obtain raw results + +अब हम इमेज को इंजन में फीड करते हैं। `recognize` कॉल एक `OcrResult` ऑब्जेक्ट रिटर्न करता है जिसमें लाइनों का कलेक्शन होता है, प्रत्येक की अपनी बाउंडिंग रेक्टेंगल होती है। + +```python +# Load your image (any format supported by aocr) +input_image_path = "input_image.png" + +# Run OCR – this returns an OcrResult with lines and bounds +raw_result = ocr_engine.recognize(input_image_path) +``` + +इस समय `raw_result.lines` में दो महत्वपूर्ण एट्रिब्यूट वाले ऑब्जेक्ट्स होते हैं: + +- `text` – उस लाइन के लिए पहचाना गया स्ट्रिंग। +- `bounds` – एक ट्यूपल जैसे `(x, y, width, height)` जो लाइन की पोज़िशन बताता है। + +--- + +## Step 3: Post‑process while preserving geometry + +रॉ OCR आउटपुट अक्सर शोरयुक्त होता है: अनचाहे कैरेक्टर, गलत स्पेस, या लाइन‑ब्रेक समस्याएँ। `ai.run_postprocessor` फ़ंक्शन टेक्स्ट को साफ़ करता है लेकिन **मूल जियोमेट्री को बरकरार रखता है**, इसलिए आपके पास अभी भी सटीक कोऑर्डिनेट्स होते हैं। + +```python +# Apply a post‑processing 
step that corrects common OCR errors +postprocessed_result = ai.run_postprocessor(raw_result) + +# The structure (lines + bounds) stays the same, only `line.text` changes +``` + +> **Pro tip:** यदि आपके पास डोमेन‑स्पेसिफिक शब्दावली (जैसे प्रोडक्ट कोड) है, तो पोस्ट‑प्रोसेसर को कस्टम डिक्शनरी पास करें ताकि एक्यूरेसी बढ़े। + +--- + +## Step 4: Extract text with coordinates – iterate and display + +अंत में, हम क्लीन की गई लाइनों पर लूप लगाते हैं, प्रत्येक लाइन के बाउंडिंग बॉक्स को उसके टेक्स्ट के साथ प्रिंट करते हैं। यही **coordinates के साथ टेक्स्ट एक्सट्रैक्ट** करने का मुख्य भाग है। + +```python +# Print each recognised line together with its bounding box +for line in postprocessed_result.lines: + print(f"[{line.bounds}] {line.text}") +``` + +### Expected Output + +मान लीजिए इनपुट इमेज में दो लाइनें हैं: “Invoice #12345” और “Total: $89.99”, तो आउटपुट कुछ इस तरह दिखेगा: + +``` +[(15, 30, 210, 25)] Invoice #12345 +[(15, 70, 190, 25)] Total: $89.99 +``` + +पहला ट्यूपल मूल इमेज पर लाइन का `(x, y, width, height)` दर्शाता है, जिससे आप रेक्टेंगल ड्रॉ कर सकते हैं, टेक्स्ट हाईलाइट कर सकते हैं, या कोऑर्डिनेट्स को किसी अन्य सिस्टम में फीड कर सकते हैं। + +--- + +## Visualising the Result (Optional) + +यदि आप बाउंडिंग बॉक्स को इमेज पर ओवरले देखना चाहते हैं, तो आप Pillow (PIL) का उपयोग करके रेक्टेंगल ड्रॉ कर सकते हैं। नीचे एक छोटा स्निपेट है; यदि आपको केवल रॉ डेटा चाहिए तो इसे स्किप कर सकते हैं। + +```python +from PIL import Image, ImageDraw + +# Open the original image +img = Image.open(input_image_path) +draw = ImageDraw.Draw(img) + +# Draw a rectangle around each line +for line in postprocessed_result.lines: + x, y, w, h = line.bounds + draw.rectangle([x, y, x + w, y + h], outline="red", width=2) + +# Save or show the annotated image +img.save("annotated_output.png") +img.show() +``` + +![run OCR on image उदाहरण जिसमें बाउंडिंग बॉक्स दिखाए गए हैं](/images/ocr-bounding-boxes.png "run OCR on image – bounding box overlay") + +ऊपर का alt टेक्स्ट **primary keyword** को शामिल करता 
है, जिससे इमेज alt एट्रिब्यूट के SEO आवश्यकताओं को पूरा किया गया है। + +--- + +## Why Structured OCR Recognition Beats Simple Text Extraction + +आप सोच सकते हैं, “क्या मैं सिर्फ OCR चलाकर टेक्स्ट नहीं ले सकता? जियोमेट्री की क्या जरूरत?” + +- **स्पैशियल कॉन्टेक्स्ट:** जब आपको फ़ॉर्म पर फ़ील्ड्स (जैसे “Date” के बगल में डेट वैल्यू) मैप करनी हों, तो कोऑर्डिनेट्स बताते हैं कि डेटा *कहाँ* है। +- **मल्टी‑कॉलम लेआउट:** साधारण लीनियर टेक्स्ट क्रम खो देता है; Structured डेटा कॉलम ऑर्डर को बरकरार रखता है। +- **पोस्ट‑प्रोसेसिंग एक्यूरेसी:** बॉक्स साइज जानने से आप तय कर सकते हैं कि शब्द हेडर है, फुटनोट है, या कोई अनचाहा आर्टिफैक्ट। + +संक्षेप में, **structured OCR recognition** आपको smarter पाइपलाइन बनाने की लचीलापन देता है—चाहे आप डेटा को डेटाबेस में फीड कर रहे हों, सर्चेबल PDF बना रहे हों, या लेआउट को समझने वाले मशीन‑लर्निंग मॉडल को ट्रेन कर रहे हों। + +--- + +## Common Edge Cases and How to Handle Them + +| Situation | What to Watch For | Suggested Fix | +|-----------|-------------------|---------------| +| **Rotated or skewed images** | Bounding boxes may be off‑axis. | Pre‑process with deskewing (e.g., OpenCV’s `warpAffine`). | +| **Very small fonts** | Engine may miss characters, leading to empty lines. | Increase image resolution or use `ocr_engine.set_dpi(300)`. | +| **Mixed languages** | Wrong language model can cause garbled text. | Set `ocr_engine.language = ["en", "de"]` before recognition. | +| **Overlapping boxes** | Post‑processor might merge two lines unintentionally. | Verify `line.bounds` after processing; adjust thresholds in `ai.run_postprocessor`. 
| + +इन परिदृश्यों को शुरुआती चरण में संभालना बाद में सिरदर्द बचाता है, विशेषकर जब आप समाधान को रोज़ाना सैकड़ों दस्तावेज़ों तक स्केल करते हैं। + +--- + +## Full End‑to‑End Script + +नीचे पूरा, तैयार‑चलाने‑योग्य प्रोग्राम दिया गया है जो सभी स्टेप्स को जोड़ता है। कॉपी‑पेस्ट करें, इमेज पाथ एडजस्ट करें, और आप तैयार हैं। + +```python +# -*- coding: utf-8 -*- +""" +Run OCR on image – extract text with coordinates using structured OCR recognition. +Author: Your Name +Date: 2026-05-03 +""" + +import aocr +import ai +from PIL import Image, ImageDraw + +def run_structured_ocr(image_path: str, annotate: bool = False): + # 1️⃣ Initialise the OCR engine + ocr_engine = aocr.Engine() + ocr_engine.recognize_mode = aocr.RecognitionMode.Structured + + # 2️⃣ Recognise the image + raw_result = ocr_engine.recognize(image_path) + + # 3️⃣ Post‑process while keeping geometry + processed = ai.run_postprocessor(raw_result) + + # 4️⃣ Print each line with its bounding box + for line in processed.lines: + print(f"[{line.bounds}] {line.text}") + + # Optional visualisation + if annotate: + img = Image.open(image_path) + draw = ImageDraw.Draw(img) + for line in processed.lines: + x, y, w, h = line.bounds + draw.rectangle([x, y, x + w, y + h], outline="red", width=2) + annotated_path = "annotated_" + image_path + img.save(annotated_path) + print(f"Annotated image saved as {annotated_path}") + +if __name__ == "__main__": + INPUT_IMG = "input_image.png" + run_structured_ocr(INPUT_IMG, annotate=True) +``` + +इस स्क्रिप्ट को चलाने से: + +1. **Run OCR on image** structured मोड के साथ। +2. हर लाइन के लिए **coordinates के साथ टेक्स्ट एक्सट्रैक्ट**। +3. 
वैकल्पिक रूप से एनोटेटेड PNG बनता है जिसमें बॉक्स दिखते हैं। + +--- + +## Conclusion + +अब आपके पास **run OCR on image** और **coordinates के साथ टेक्स्ट एक्सट्रैक्ट** करने के लिए एक ठोस, स्व-समाहित समाधान है, जो **structured OCR recognition** का उपयोग करता है। कोड हर चरण—इंजन इनिशियलाइज़ेशन से पोस्ट‑प्रोसेसिंग और विज़ुअल वेरिफिकेशन—को दर्शाता है, जिससे आप इसे रसीदों, फ़ॉर्म्स या किसी भी विज़ुअल डॉक्यूमेंट पर लागू कर सकते हैं जिसे सटीक टेक्स्ट लोकेलाइज़ेशन चाहिए। + +अब क्या अगला कदम? `aocr` इंजन को किसी अन्य लाइब्रेरी (Tesseract, EasyOCR) से बदलें और देखें कि उनका structured आउटपुट कैसे अलग है। विभिन्न पोस्ट‑प्रोसेसिंग स्ट्रेटेजी जैसे स्पेल‑चेकिंग या कस्टम रेगेक्स फ़िल्टर आज़माएँ ताकि आपके डोमेन की एक्यूरेसी बढ़े। और यदि आप बड़ा पाइपलाइन बना रहे हैं, तो `(text, bounds)` पेयर्स को डेटाबेस में स्टोर करने पर विचार करें ताकि बाद में एनालिटिक्स आसान हो सके। + +Happy coding, और आपके OCR प्रोजेक्ट्स हमेशा सटीक रहें! + +{{< /blocks/products/pf/tutorial-page-section >}} +{{< /blocks/products/pf/main-container >}} +{{< /blocks/products/pf/main-wrap-class >}} +{{< blocks/products/products-backtop-button >}} \ No newline at end of file diff --git a/ocr/hongkong/python/general/extract-text-from-image-ocr-with-aspose-ai-spell-check/_index.md b/ocr/hongkong/python/general/extract-text-from-image-ocr-with-aspose-ai-spell-check/_index.md new file mode 100644 index 000000000..3cf4fdae0 --- /dev/null +++ b/ocr/hongkong/python/general/extract-text-from-image-ocr-with-aspose-ai-spell-check/_index.md @@ -0,0 +1,228 @@ +--- +category: general +date: 2026-05-03 +description: 使用 Aspose OCR 與 AI 拼寫檢查從圖片提取文字。了解如何對圖片進行 OCR、載入圖片以進行 OCR、從發票中辨識文字以及釋放 + GPU 資源。 +draft: false +keywords: +- extract text from image +- how to ocr image +- load image for ocr +- release gpu resources +- recognize text from invoice +language: zh-hant +og_description: 使用 Aspose OCR 與 AI 拼寫檢查從圖片提取文字。逐步指南,涵蓋如何對圖片進行 OCR、載入圖片以進行 OCR,以及釋放 + GPU 資源。 +og_title: 從圖片提取文字 – 完整 OCR 與拼寫檢查指南 +tags: +- OCR +- Aspose +- AI +- Python 
+title: 從圖片提取文字 – 結合 Aspose AI 拼寫檢查的 OCR +url: /zh-hant/python/general/extract-text-from-image-ocr-with-aspose-ai-spell-check/ +--- + +{{< blocks/products/pf/main-wrap-class >}} +{{< blocks/products/pf/main-container >}} +{{< blocks/products/pf/tutorial-page-section >}} + +# 從圖像提取文字 – 完整 OCR 與拼寫檢查指南 + +是否曾需要 **從圖像提取文字**,卻不確定哪個函式庫能同時提供速度與準確度?你並不孤單。在許多實務專案中——例如發票處理、收據數位化或合約掃描——從圖片取得乾淨、可搜尋的文字是第一道關卡。 + +好消息是,Aspose OCR 搭配輕量級的 Aspose AI 模型,只需幾行 Python 程式碼即可完成這項工作。在本教學中,我們將一步步說明 **如何 OCR 圖像**、正確載入圖片、執行內建的拼寫檢查後處理,最後 **釋放 GPU 資源**,讓你的應用程式保持記憶體友好。 + +閱讀完本指南後,你將能 **辨識發票圖像中的文字**、自動校正常見的 OCR 錯誤,並在下一批次處理前保持 GPU 整潔。 + +--- + +## 需要的環境 + +- Python 3.9 或更新版本(程式碼使用型別提示,但在較早的 3.x 版本亦可執行) +- `aspose-ocr` 與 `aspose-ai` 套件(透過 `pip install aspose-ocr aspose-ai` 安裝) +- 支援 CUDA 的 GPU 為可選項目;若未偵測到 GPU,腳本會自動回退至 CPU。 +- 範例圖片,例如 `sample_invoice.png`,放置於可參照的資料夾中。 + +不需要大型機器學習框架,也不需要龐大的模型下載——只要一個小型的 Q4‑K‑M 量化模型,即可在大多數 GPU 上順暢運行。 + +--- + +## 第一步:初始化 OCR 引擎 – 從圖像提取文字 + +首先建立 `OcrEngine` 實例,並告訴它預期的語言。此處選擇英文,並要求純文字輸出,這對後續處理最為理想。 + +```python +import aocr # Aspose OCR package +import aspose.ai as ai # Aspose AI package + +# Initialise the OCR engine +ocr_engine = aocr.OcrEngine() +ocr_engine.language = aocr.Language.English # Choose any supported language +ocr_engine.recognize_mode = aocr.RecognitionMode.Plain # Plain text makes post‑processing easier +``` + +**為什麼這很重要:** 設定語言會縮小字元集,提升辨識準確度。純文字模式會去除版面資訊,當你只想從圖像提取文字時,這正是你需要的。 + +--- + +## 第二步:載入圖像以供 OCR – 如何 OCR 圖像 + +接著將實際的圖片提供給引擎。`Image.load` 輔助函式支援常見格式(PNG、JPEG、TIFF),並抽象化檔案 I/O 的細節。 + +```python +# Load the input image – this is the "load image for OCR" step +input_image = aocr.Image.load("YOUR_DIRECTORY/sample_invoice.png") +raw_text = ocr_engine.recognize(input_image) # Returns the recognised text as a string +``` + +**小技巧:** 若來源圖片尺寸過大,建議先將其縮小後再送入引擎;較小的尺寸可減少 GPU 記憶體使用,同時不會明顯影響辨識品質。 + +--- + +## 第三步:設定 Aspose AI 模型 – 辨識發票文字 + +Aspose AI 內建一個可自動下載的微型 GGUF 模型。範例使用 `Qwen2.5‑3B‑Instruct‑GGUF` 儲存庫,量化為 `q4_k_m`。我們同時指示執行環境在 GPU 上配置 20 層,以在速度與 VRAM 
使用之間取得平衡。 + +```python +# Model configuration – auto‑download a small Q4‑K‑M quantised model +model_config = ai.AsposeAIModelConfig() +model_config.allow_auto_download = "true" +model_config.hugging_face_repo_id = "bartowski/Qwen2.5-3B-Instruct-GGUF" +model_config.hugging_face_quantization = "q4_k_m" +model_config.gpu_layers = 20 # Use 20 GPU layers when a GPU is available +``` + +**背後原理:** 量化模型在磁碟上約 1.5 GB,遠小於完整精度模型,卻仍保有足夠的語言細節,足以偵測典型的 OCR 拼寫錯誤。 + +--- + +## 第四步:初始化 AsposeAI 並掛載拼寫檢查後處理器 + +Aspose AI 內建即用的拼寫檢查後處理器。將它掛上後,所有 OCR 結果都會自動被清理。 + +```python +# Initialise AsposeAI and attach the built‑in spell‑check post‑processor +ocr_ai = ai.AsposeAI(model_config) # Pass the config we just built +ocr_ai.set_post_processor(ocr_ai.postprocessor_spell_check, {}) # Empty dict → default settings +``` + +**為什麼要使用後處理器?** OCR 引擎常把 “Invoice” 誤讀為 “Invo1ce”,或把 “Total” 誤讀為 “T0tal”。拼寫檢查會使用輕量語言模型對原始字串進行校正,無需自行編寫字典。 + +--- + +## 第五步:對 OCR 結果執行拼寫檢查後處理器 + +所有設定完成後,只要一次呼叫即可取得校正後的文字。我們同時印出原始與清理過的版本,讓你直觀比較改進效果。 + +```python +# Run the spell‑check post‑processor on the OCR result +corrected_text = ocr_ai.run_postprocessor(raw_text) + +print("Original :", raw_text) +print("Corrected:", corrected_text) +``` + +發票的典型輸出可能如下所示: + +``` +Original : Invo1ce #12345 +Date: 2023/07/15 +Total: $1,250.00 +... +Corrected: Invoice #12345 +Date: 2023/07/15 +Total: $1,250.00 +... 
+``` + +可見 “Invo1ce” 已被正確轉換為 “Invoice”。這就是內建 AI 拼寫檢查的威力。 + +--- + +## 第六步:釋放 GPU 資源 – 安全釋放 GPU 資源 + +如果你在長時間執行的服務(例如每分鐘處理數十張發票的 Web API)中使用此腳本,必須在每個批次後釋放 GPU 上下文,否則會出現記憶體洩漏,最終導致 “CUDA out of memory” 錯誤。 + +```python +# Release GPU resources – crucial to avoid memory leaks +ocr_ai.free_resources() +``` + +**專業提示:** 在 `finally` 區塊或 context manager 中呼叫 `free_resources()`,確保即使發生例外也能執行釋放。 + +--- + +## 完整範例程式 + +將上述所有片段組合,即可得到一個可直接放入任何專案的獨立腳本。 + +```python +# extract_text_from_image.py +import aocr +import aspose.ai as ai + +def main(): + # Step 1: Initialise OCR engine + ocr_engine = aocr.OcrEngine() + ocr_engine.language = aocr.Language.English + ocr_engine.recognize_mode = aocr.RecognitionMode.Plain + + # Step 2: Load image for OCR + input_image = aocr.Image.load("YOUR_DIRECTORY/sample_invoice.png") + raw_text = ocr_engine.recognize(input_image) + + # Step 3: Configure Aspose AI model + model_cfg = ai.AsposeAIModelConfig() + model_cfg.allow_auto_download = "true" + model_cfg.hugging_face_repo_id = "bartowski/Qwen2.5-3B-Instruct-GGUF" + model_cfg.hugging_face_quantization = "q4_k_m" + model_cfg.gpu_layers = 20 + + # Step 4: Initialise AI and attach spell‑check + ocr_ai = ai.AsposeAI(model_cfg) + ocr_ai.set_post_processor(ocr_ai.postprocessor_spell_check, {}) + + # Step 5: Run spell‑check + corrected_text = ocr_ai.run_postprocessor(raw_text) + + print("Original :", raw_text) + print("Corrected:", corrected_text) + + # Step 6: Release GPU resources + ocr_ai.free_resources() + +if __name__ == "__main__": + main() +``` + +將檔案儲存、調整圖片路徑,然後執行 `python extract_text_from_image.py`。你應該會在主控台看到已清理的發票文字。 + +--- + +## 常見問題 (FAQ) + +**Q: 這能在只有 CPU 的機器上執行嗎?** +A: 完全可以。若未偵測到 GPU,Aspose AI 會自動回退至 CPU 執行,雖然速度較慢。你也可以透過設定 `model_cfg.gpu_layers = 0` 強制使用 CPU。 + +**Q: 若我的發票使用非英文語言,該怎麼辦?** +A: 將 `ocr_engine.language` 改為相應的列舉值(例如 `aocr.Language.Spanish`)。拼寫檢查模型支援多語言,但使用針對特定語言的模型可能會得到更佳結果。 + +**Q: 能否在迴圈中處理多張圖片?** +A: 可以。只要把載入、辨識與後處理步驟放入 `for` 迴圈即可。若重複使用同一個 AI 實例,別忘了在迴圈結束或每個批次後呼叫 
`ocr_ai.free_resources()`。 + +**Q: 模型下載大小是多少?** +A: 量化的 `q4_k_m` 版本約 1.5 GB。首次執行後會快取於本機,之後的執行即時完成。 + +--- + +## 結論 + +本教學示範了如何使用 Aspose OCR **從圖像提取文字**、設定微型 AI 模型、套用拼寫檢查後處理器,並安全 **釋放 GPU 資源**。整個流程涵蓋從載入圖片到清理資源的全部步驟,為 **辨識發票文字** 場景提供可靠的管線。 + +接下來的步驟?可以嘗試將拼寫檢查換成自訂的實體抽取模型。 + +{{< /blocks/products/pf/tutorial-page-section >}} +{{< /blocks/products/pf/main-container >}} +{{< /blocks/products/pf/main-wrap-class >}} +{{< blocks/products/products-backtop-button >}} \ No newline at end of file diff --git a/ocr/hongkong/python/general/how-to-batch-ocr-with-aspose-ocr-full-python-guide/_index.md b/ocr/hongkong/python/general/how-to-batch-ocr-with-aspose-ocr-full-python-guide/_index.md new file mode 100644 index 000000000..6104f5155 --- /dev/null +++ b/ocr/hongkong/python/general/how-to-batch-ocr-with-aspose-ocr-full-python-guide/_index.md @@ -0,0 +1,212 @@ +--- +category: general +date: 2026-05-03 +description: 如何使用 Aspose OCR 與 AI 拼寫檢查批次處理圖片。學習從圖片提取文字、套用拼寫檢查、使用免費 AI 資源及校正 OCR 錯誤。 +draft: false +keywords: +- how to batch ocr +- extract text from images +- free ai resources +- apply spell check +- correct ocr errors +language: zh-hant +og_description: 如何使用 Aspose OCR 與 AI 拼寫檢查批次處理圖片文字辨識。跟隨一步一步的指南,從圖片中提取文字、套用拼寫檢查、使用免費 + AI 資源,並校正 OCR 錯誤。 +og_title: 如何使用 Aspose OCR 進行批次 OCR – 完整 Python 教學 +tags: +- OCR +- Python +- AI +- Aspose +title: 如何使用 Aspose OCR 進行批次 OCR – 完整 Python 指南 +url: /zh-hant/python/general/how-to-batch-ocr-with-aspose-ocr-full-python-guide/ +--- + +{{< blocks/products/pf/main-wrap-class >}} +{{< blocks/products/pf/main-container >}} +{{< blocks/products/pf/tutorial-page-section >}} + +# 如何使用 Aspose OCR 進行批次 OCR – 完整 Python 教學 + +有沒有想過 **如何批次 OCR** 整個資料夾的掃描 PDF 或相片,而不必為每個檔案寫一個獨立腳本?你並不孤單。在許多實務工作流程中,你需要 **從影像中擷取文字**、修正拼寫錯誤,最後釋放已分配的 AI 資源。本教學將一步步示範如何使用 Aspose OCR(輕量級 AI 後處理器)以及幾行 Python 程式碼完成這些工作。 + +我們會說明如何初始化 OCR 引擎、掛接 AI 拼寫檢查器、遍歷圖片目錄,並在最後清理模型。完成後,你將擁有一個可直接執行的腳本,能 **自動校正 OCR 錯誤** 並釋放 **免費的 AI 資源**,讓 GPU 保持順暢。 + +## 需要的環境 + +- Python 3.9+(程式碼使用型別提示,但在較早的 3.x 版本亦可執行) 
+- `asposeocr` 套件(`pip install asposeocr`)– 提供 OCR 引擎 +- 取得 Hugging Face 模型 `bartowski/Qwen2.5-3B-Instruct-GGUF`(會自動下載) +- 具備至少數 GB VRAM 的 GPU(腳本預設 `gpu_layers = 30`,如有需要可降低) + +不需要外部服務,也不需要付費 API – 全部在本機執行。 + +--- + +## 步驟 1:設定 OCR 引擎 – **如何有效率地批次 OCR** + +在處理上千張圖片之前,我們需要一個穩固的 OCR 引擎。Aspose OCR 讓我們在一次呼叫中選擇語言與辨識模式。 + +```python +# Step 1: Initialize the OCR engine for English plain‑text output +def init_ocr() -> aocr.OcrEngine: + ocr_engine = aocr.OcrEngine() + ocr_engine.language = aocr.Language.English # English language pack + ocr_engine.recognize_mode = aocr.RecognitionMode.Plain # Returns raw string, no layout + return ocr_engine +``` + +**為什麼這很重要:** 將 `recognize_mode` 設為 `Plain` 可讓輸出保持輕量,這在之後要執行拼寫檢查時非常理想。若需要版面資訊,可改為 `Layout`,但會增加不必要的負擔,對批次作業而言不太適合。 + +> **小技巧:** 若要處理多語言掃描,可傳入類似 `ocr_engine.language = [aocr.Language.English, aocr.Language.Spanish]` 的清單。 + +--- + +## 步驟 2:初始化 AI 後處理器 – **對 OCR 輸出套用拼寫檢查** + +Aspose AI 內建可自行掛載的後處理器,我們從 Hugging Face 下載量化版 Qwen 2.5 模型,並將拼寫檢查流程掛上。 + +```python +# Step 2: Configure and start the AI post‑processor +def init_ai() -> aocr.ai.AsposeAI: + model_cfg = AsposeAIModelConfig() + model_cfg.allow_auto_download = "true" + model_cfg.hugging_face_repo_id = "bartowski/Qwen2.5-3B-Instruct-GGUF" + model_cfg.hugging_face_quantization = "q4_k_m" + model_cfg.gpu_layers = 30 # Adjust based on your GPU memory + ai_processor = AsposeAI() + ai_processor.initialize(model_cfg) + + # Attach the built‑in spell‑check post‑processor + ai_processor.set_post_processor(ai_processor.postprocessor_spell_check, {}) + return ai_processor +``` + +**為什麼這很重要:** 模型已量化為 `q4_k_m`,可大幅降低記憶體使用量,同時仍保有不錯的語言理解能力。呼叫 `set_post_processor` 後,Aspose AI 會自動在任何傳入的字串上執行 **apply spell check** 步驟。 + +> **注意:** 若 GPU 無法支援 30 層,請將數值降至 15 或甚至 5 – 腳本仍能運作,只是速度會稍慢。 + +--- + +## 步驟 3:對單張圖片執行 OCR 並 **校正 OCR 錯誤** + +OCR 引擎與 AI 拼寫檢查器都準備好後,我們將它們結合。此函式會載入圖片、擷取原始文字,然後使用 AI 後處理器進行清理。 + +```python +# Step 3: OCR an image and run the spell‑check post‑processor +def 
ocr_and_correct(image_path: str, + ocr_engine: aocr.OcrEngine, + ai_processor: aocr.ai.AsposeAI) -> str: + image = aocr.Image.load(image_path) # Load any supported format + raw_text = ocr_engine.recognize(image) # Plain string from OCR + corrected_text = ai_processor.run_postprocessor(raw_text) + return corrected_text +``` + +**為什麼這很重要:** 直接把原始 OCR 文字送入 AI 模型,即可完成 **校正 OCR 錯誤** 的工作,無需自行撰寫正則表達式或自訂字典。模型具備上下文感知能力,能把 “recieve” 修正為 “receive”,甚至更微妙的錯誤。 + +--- + +## 步驟 4:**大量擷取影像文字** – 真正的批次迴圈 + +這裡就是 **如何批次 OCR** 發揮威力的地方。我們遍歷目錄、略過不支援的檔案,並將每個校正後的結果寫入 `.txt` 檔。 + +```python +# Step 4: Process an entire folder of images +if __name__ == "__main__": + # Initialize once – reuse for every file + ocr_engine = init_ocr() + ai_processor = init_ai() + + input_dir = "YOUR_DIRECTORY/input_images" + output_dir = "YOUR_DIRECTORY/output_text" + os.makedirs(output_dir, exist_ok=True) + + for file_name in os.listdir(input_dir): + # Only handle common image extensions + if not file_name.lower().endswith(('.png', '.jpg', '.jpeg', '.tif', '.tiff')): + continue + + image_path = os.path.join(input_dir, file_name) + corrected = ocr_and_correct(image_path, ocr_engine, ai_processor) + + txt_path = os.path.join(output_dir, + os.path.splitext(file_name)[0] + ".txt") + with open(txt_path, "w", encoding="utf-8") as txt_file: + txt_file.write(corrected) + + print(f"Processed {file_name}") + + # Step 5: Release **free AI resources** after the batch finishes + ai_processor.free_resources() +``` + +### 預期輸出 + +若影像內的句子為 *“The quick brown fox jumps over the lazzy dog.”*,產生的文字檔會顯示: + +``` +The quick brown fox jumps over the lazy dog. 
+``` + +可以看到雙「z」已自動更正 – 這就是 AI 拼寫檢查的效果。 + +**為什麼這很重要:** 我們只 **建立一次** OCR 與 AI 物件,並重複使用,避免每處理一個檔案就重新載入模型。這是以規模化方式 **如何批次 OCR** 最有效的做法。 + +--- + +## 步驟 5:清理 – 正確 **釋放 AI 資源** + +作業結束後,呼叫 `free_resources()` 會釋放 GPU 記憶體、CUDA 內容以及模型產生的暫存檔。 + +```python +# Step 5: Explicitly free GPU and model memory +ai_processor.free_resources() +``` + +若省略此步驟,GPU 可能會留下未釋放的記憶體,導致後續 Python 程式當機或耗盡 VRAM。把它想成批次工作中的「關燈」程序。 + +--- + +## 常見問題與額外技巧 + +| 問題 | 觀察現象 | 解決方式 | +|------|----------|----------| +| **記憶體不足** | GPU 在處理數十張圖片後崩潰 | 降低 `gpu_layers`,或改用 CPU(`model_cfg.gpu_layers = 0`) | +| **缺少語言套件** | OCR 回傳空字串 | 確認 `asposeocr` 版本已包含英文語言資料;必要時重新安裝 | +| **非影像檔案** | 腳本在偶發的 `.pdf` 上當機 | `if not file_name.lower().endswith(...)` 判斷已自動跳過 | +| **拼寫檢查未套用** | 輸出與原始 OCR 完全相同 | 確認在迴圈前已呼叫 `ai_processor.set_post_processor` | +| **批次速度慢** | 每張圖片耗時 >5 秒 | 第一次執行後將 `model_cfg.allow_auto_download = "false"`,避免每次都重新下載模型 | + +**小技巧:** 若要 **擷取影像文字** 的語言不是英文,只需將 `ocr_engine.language` 改成對應的列舉值(例如 `aocr.Language.French`)。同樣的 AI 後處理器仍會執行拼寫檢查,但若想取得最佳效果,建議使用相同語言的模型。 + +--- + +## 重點回顧與後續步驟 + +我們已完整說明 **如何批次 OCR** 的全流程: + +1. **初始化** 只輸出純文字的 OCR 引擎(英文)。 +2. **設定** AI 拼寫檢查模型,並將其綁定為後處理器。 +3. **執行** OCR,讓 AI **自動校正 OCR 錯誤**。 +4. **遍歷** 目錄,以批次方式 **擷取影像文字**。 +5. **釋放** AI 資源,結束工作。 + +接下來你可以: + +- 將校正後的文字送入下游 NLP 流程(情感分析、實體抽取等)。 +- 以 `ai_processor.set_post_processor(your_custom_func, {})` 替換拼寫檢查,改為自訂摘要器。 +- 若 GPU 能同時處理多條串流,可使用 `concurrent.futures.ThreadPoolExecutor` 平行化資料夾迴圈。 + +--- + +## 結語 + +批次 OCR 不必是繁重的工作。結合 Aspose OCR 與輕量級 AI 模型,你即可得到一個 **一站式解決方案**,同時 **擷取影像文字**、**套用拼寫檢查**、**校正 OCR 錯誤**,並 **乾淨地釋放 AI 資源**。先在測試資料夾上跑跑腳本,依硬體調整 GPU 層數,即可在數分鐘內建置上線級別的流水線。 + +對模型微調、PDF 處理或整合至 Web 服務有任何疑問?歡迎在下方留言或於 GitHub 私訊我。祝開發順利,願你的 OCR 永遠精準! 
+ +{{< /blocks/products/pf/tutorial-page-section >}} +{{< /blocks/products/pf/main-container >}} +{{< /blocks/products/pf/main-wrap-class >}} +{{< blocks/products/products-backtop-button >}} \ No newline at end of file diff --git a/ocr/hongkong/python/general/python-ocr-tutorial-batch-ocr-processing-made-easy/_index.md b/ocr/hongkong/python/general/python-ocr-tutorial-batch-ocr-processing-made-easy/_index.md new file mode 100644 index 000000000..b751010d2 --- /dev/null +++ b/ocr/hongkong/python/general/python-ocr-tutorial-batch-ocr-processing-made-easy/_index.md @@ -0,0 +1,296 @@ +--- +category: general +date: 2026-05-03 +description: Python OCR 教學,示範如何載入 PNG 圖像檔案、辨識圖像文字,並提供免費 AI 資源以進行批量 OCR 處理。 +draft: false +keywords: +- python ocr tutorial +- batch ocr processing +- free ai resources +- load png image +- recognize text from image +language: zh-hant +og_description: Python OCR 教學將指導你如何載入 PNG 圖像、從圖像中辨識文字,以及處理免費 AI 資源以進行批次 OCR 處理。 +og_title: Python OCR 教學 – 使用免費 AI 資源快速批次 OCR +tags: +- OCR +- Python +- AI +title: Python OCR 教學 – 批量 OCR 處理變得簡單 +url: /zh-hant/python/general/python-ocr-tutorial-batch-ocr-processing-made-easy/ +--- + +{{< blocks/products/pf/main-wrap-class >}} +{{< blocks/products/pf/main-container >}} +{{< blocks/products/pf/tutorial-page-section >}} + +# Python OCR 教學 – 批次 OCR 處理變得簡單 + +有沒有需要一個 **python ocr tutorial**,真的可以讓你一次處理數十個 PNG 檔案而不抓狂?你並不孤單。在許多實務專案中,你必須 **load png image** 檔案,將它們送入引擎,然後在完成後清理 AI 資源。 + +在本指南中,我們將一步步走過一個完整、可直接執行的範例,說明如何 **recognize text from image** 檔案、批次處理,並釋放底層 AI 記憶體。最後,你會得到一個可直接放入任何專案的自包含腳本——沒有多餘的贅述,只有必要的核心。 + +## 需要的條件 + +- Python 3.10 或更新版本(此處使用的語法依賴 f‑strings 與 type hints) +- 一個提供 `engine.recognize` 方法的 OCR 套件——示範中我們假設有一個虛構的 `aocr` 套件,你也可以改用 Tesseract、EasyOCR 等。 +- 文章中程式碼片段所示的 `ai` 輔助模組(負責模型初始化與資源清理) +- 一個放有欲處理 PNG 檔案的資料夾 + +如果你尚未安裝 `aocr` 或 `ai`,可以使用下方的「可選 Stubs」來模擬——請參考最後的說明。 + +## 步驟 1:初始化 AI 引擎(Free AI Resources) + +在將任何影像送入 OCR 流程之前,必須先讓底層模型就緒。只初始化一次即可節省記憶體並加速批次作業。 + +```python +# step_1_initialize.py +import ai # 
hypothetical helper that wraps the AI model +import aocr # OCR library + +def init_engine(config_path: str = "config.yaml"): + """ + Initialize the AI engine if it hasn't been set up yet. + This uses free AI resources – the engine will be released later. + """ + if not ai.is_initialized(): + ai.initialize(config_path) # auto‑initialize with the provided configuration + else: + print("Engine already initialized.") +``` + +**為什麼這很重要:** +若對每張影像都重複呼叫 `ai.initialize`,會不斷分配 GPU 記憶體,最終導致腳本崩潰。透過檢查 `ai.is_initialized()`,我們保證只分配一次——這就是「Free AI Resources」的原則。 + +## 步驟 2:載入 PNG 影像檔案以進行批次 OCR 處理 + +現在我們收集所有要進行 OCR 的 PNG 檔案。使用 `pathlib` 可讓程式碼保持跨平台。 + +```python +# step_2_load_images.py +from pathlib import Path +from typing import List + +def collect_png_paths(directory: str) -> List[Path]: + """ + Scan `directory` and return a list of Path objects pointing to PNG files. + """ + base_path = Path(directory) + if not base_path.is_dir(): + raise NotADirectoryError(f"'{directory}' is not a valid folder.") + + png_files = sorted(base_path.glob("*.png")) + if not png_files: + raise FileNotFoundError("No PNG images found in the specified directory.") + + print(f"Found {len(png_files)} PNG image(s) to process.") + return png_files +``` + +**邊緣情況:** +如果資料夾中有非 PNG 檔案(例如 JPEG),會被忽略,避免 `engine.recognize` 因不支援的格式而失敗。 + +## 步驟 3:對每張影像執行 OCR 並進行後處理 + +在引擎就緒且檔案清單準備好之後,我們可以逐一迭代影像,取得原始文字,並交給後處理器清理常見的 OCR 雜訊(例如多餘的換行)。 + +```python +# step_3_ocr_batch.py +import aocr +import ai +from pathlib import Path +from typing import List + +def ocr_batch(image_paths: List[Path]) -> List[str]: + """ + Perform OCR on each PNG image and return a list of cleaned strings. 
+ """ + results = [] + for image_path in image_paths: + # Load the image – aocr.Image.load abstracts away Pillow/OpenCV details + img = aocr.Image.load(str(image_path)) + + # Recognize raw text + raw_text = engine.recognize(img) + + # Refine the raw OCR output using the AI post‑processor + cleaned_text = ai.run_postprocessor(raw_text) + results.append(cleaned_text) + + print(f"Processed {image_path.name}: {len(cleaned_text)} characters extracted.") + + return results +``` + +**為什麼要將載入與辨識分開:** +`aocr.Image.load` 可能採用延遲解碼,對大型批次更快。將載入步驟寫得明確,也方便日後改用其他影像函式庫(例如處理 JPEG 或 TIFF)時直接替換。 + +## 步驟 4:清理 – 在批次結束後釋放 AI 資源 + +批次處理完成後,我們必須釋放模型,以免記憶體泄漏,特別是在有 GPU 的機器上。 + +```python +# step_4_cleanup.py +import ai + +def release_resources(): + """ + Free any allocated AI resources. Safe to call multiple times. + """ + if ai.is_initialized(): + ai.free_resources() + print("AI resources have been released.") + else: + print("No AI resources were allocated.") +``` + +## 完整腳本 – 把四個步驟串起來 + +以下是一個單一檔案,將上述四個步驟整合成完整工作流程。將其存為 `batch_ocr.py`,然後在命令列執行。 + +```python +# batch_ocr.py +""" +Python OCR tutorial – end‑to‑end batch OCR processing. +Loads PNG images, runs OCR, post‑processes results, and frees AI resources. 
+""" + +import sys +from pathlib import Path +import ai +import aocr + +# ---------------------------------------------------------------------- +# Helper functions (copied from the steps above) +# ---------------------------------------------------------------------- +def init_engine(cfg: str = "config.yaml"): + if not ai.is_initialized(): + ai.initialize(cfg) + else: + print("Engine already initialized.") + +def collect_png_paths(directory: str): + base_path = Path(directory) + if not base_path.is_dir(): + raise NotADirectoryError(f"'{directory}' is not a valid folder.") + png_files = sorted(base_path.glob("*.png")) + if not png_files: + raise FileNotFoundError("No PNG images found in the specified directory.") + print(f"Found {len(png_files)} PNG image(s) to process.") + return png_files + +def ocr_batch(image_paths): + results = [] + for image_path in image_paths: + img = aocr.Image.load(str(image_path)) + raw_text = engine.recognize(img) + cleaned_text = ai.run_postprocessor(raw_text) + results.append(cleaned_text) + print(f"Processed {image_path.name}: {len(cleaned_text)} characters.") + return results + +def release_resources(): + if ai.is_initialized(): + ai.free_resources() + print("AI resources released.") + else: + print("No resources to release.") + +# ---------------------------------------------------------------------- +# Main execution block +# ---------------------------------------------------------------------- +if __name__ == "__main__": + if len(sys.argv) != 2: + print("Usage: python batch_ocr.py ") + sys.exit(1) + + image_dir = sys.argv[1] + + try: + init_engine() + png_paths = collect_png_paths(image_dir) + texts = ocr_batch(png_paths) + + # Optional: write results to a single text file + output_file = Path("ocr_results.txt") + with output_file.open("w", encoding="utf-8") as f: + for path, txt in zip(png_paths, texts): + f.write(f"--- {path.name} ---\n") + f.write(txt + "\n\n") + print(f"All results saved to {output_file.resolve()}") + 
finally: + release_resources() +``` + +### 預期輸出 + +對包含三個 PNG 的資料夾執行腳本時,可能會印出: + +``` +Engine already initialized. +Found 3 PNG image(s) to process. +Processed invoice1.png: 452 characters. +Processed receipt2.png: 317 characters. +Processed flyer3.png: 689 characters. +All results saved to /home/user/ocr_results.txt +AI resources released. +``` + +`ocr_results.txt` 檔案會為每張影像加入清晰的分隔符,並寫入清理過的 OCR 文字。 + +## 可選 Stubs(若沒有真實的 aocr 與 ai 套件) + +如果你只想測試流程,而不想安裝大型 OCR 套件,可以建立最小的模擬模組: + +```python +# aocr/__init__.py +class Image: + @staticmethod + def load(path): + return f"ImageObject({path})" + +def dummy_recognize(image): + return "Raw OCR output for " + str(image) + +engine = type("Engine", (), {"recognize": staticmethod(dummy_recognize)})() +``` + +```python +# ai/__init__.py +_state = {"initialized": False} + +def is_initialized(): + return _state["initialized"] + +def initialize(cfg): + print(f"Initializing AI engine with {cfg}") + _state["initialized"] = True + +def run_postprocessor(text): + # Very naive cleanup: strip extra spaces + return " ".join(text.split()) + +def free_resources(): + print("Freeing AI resources") + _state["initialized"] = False +``` + +將這兩個資料夾放在 `batch_ocr.py` 同一目錄下,腳本即可執行,並印出模擬結果。 + +## 專業小技巧與常見陷阱 + +- **記憶體峰值**:若處理成千上萬張高解析度 PNG,建議在 OCR 前先縮小尺寸。`aocr.Image.load` 通常接受 `max_size` 參數。 +- **Unicode 處理**:務必以 `encoding="utf-8"` 開啟輸出檔案;OCR 引擎可能會產生非 ASCII 字元。 +- **平行處理**:對於 CPU‑bound 的 OCR,可以將 `ocr_batch` 包在 `concurrent.futures.ThreadPoolExecutor` 中執行。但記得只保留單一 `ai` 實例——若每個執行緒都呼叫 `ai.initialize`,會破壞「Free AI Resources」的目標。 +- **錯誤韌性**:將每張影像的迴圈包在 `try/except` 中,避免單一損壞的 PNG 中斷整個批次。 + +## 結語 + +現在你已掌握一個 **python ocr tutorial**,示範如何 **load png image** 檔案、執行 **batch OCR processing**,以及負責任地管理 **Free AI Resources**。完整、可執行的範例清楚說明了如何 **recognize text from image** 物件並在之後清理資源,讓你可以直接複製貼上到自己的專案中,而不必再尋找缺失的部份。 + +準備好下一步了嗎?試著將這裡的 stub `aocr` 與 `ai` 模組換成真實的庫,例如 `pytesseract` 或 `torchvision`。你也可以擴充腳本,輸出 JSON、寫入資料庫,或整合雲端儲存桶。可能性無限——祝你寫程式開心! 
+ +{{< /blocks/products/pf/tutorial-page-section >}} +{{< /blocks/products/pf/main-container >}} +{{< /blocks/products/pf/main-wrap-class >}} +{{< blocks/products/products-backtop-button >}} \ No newline at end of file diff --git a/ocr/hongkong/python/general/run-ocr-on-image-complete-guide-to-structured-text-extractio/_index.md b/ocr/hongkong/python/general/run-ocr-on-image-complete-guide-to-structured-text-extractio/_index.md new file mode 100644 index 000000000..5597f3a1c --- /dev/null +++ b/ocr/hongkong/python/general/run-ocr-on-image-complete-guide-to-structured-text-extractio/_index.md @@ -0,0 +1,252 @@ +--- +category: general +date: 2026-05-03 +description: 學習如何在圖像上執行 OCR,並使用結構化 OCR 識別提取帶有座標的文字。附有逐步 Python 程式碼。 +draft: false +keywords: +- run OCR on image +- extract text with coordinates +- structured OCR recognition +- OCR post‑processing +- bounding box extraction +- image text detection +language: zh-hant +og_description: 使用結構化 OCR 識別對圖像執行文字辨識,並取得帶座標的文字。完整的 Python 範例與說明。 +og_title: 在圖像上執行 OCR – 結構化文字提取教學 +tags: +- OCR +- Python +- Computer Vision +title: 在圖像上執行 OCR – 結構化文字提取完整指南 +url: /zh-hant/python/general/run-ocr-on-image-complete-guide-to-structured-text-extractio/ +--- + +{{< blocks/products/pf/main-wrap-class >}} +{{< blocks/products/pf/main-container >}} +{{< blocks/products/pf/tutorial-page-section >}} + +# 在圖像上執行 OCR – 結構化文字提取完整指南 + +曾經需要 **run OCR on image** 檔案,但不確定如何保留每個字的精確位置嗎?你並不孤單。在許多專案——收據掃描、表單數位化或 UI 測試——中,你不僅需要原始文字,還需要告訴你每行在圖片上所在位置的邊界框。 + +本教學示範如何使用 **aocr** 引擎 *run OCR on image*、請求 **structured OCR recognition**,並在保留幾何資訊的同時對結果進行後處理。完成後,你只需幾行 Python 程式碼即可 **extract text with coordinates**,同時了解結構化模式對下游任務的重要性。 + +## 你將學會 + +- 如何為 **structured OCR recognition** 初始化 OCR 引擎。 +- 如何輸入圖像並取得包含行邊界的原始結果。 +- 如何執行後處理器,在不失去幾何資訊的前提下清理文字。 +- 如何遍歷最終的行,將文字與其邊界框一起列印。 + +沒有魔法,沒有隱藏步驟——只是一個完整、可直接執行的範例,讓你可以直接套用到自己的專案中。 + +--- + +## 前置條件 + +在開始之前,請確保已安裝以下項目: + +```bash +pip install aocr ai # hypothetical packages; replace with real ones if needed +``` + 
+你還需要一張包含清晰可讀文字的圖像檔(`input_image.png` 或 `.jpg`)。無論是掃描的發票還是螢幕截圖,只要 OCR 引擎能辨識文字即可。 + +--- + +## 步驟 1:為結構化辨識初始化 OCR 引擎 + +首先,我們建立 `aocr.Engine()` 的實例,並告訴它我們需要 **structured OCR recognition**。結構化模式不僅返回純文字,還會提供每行的幾何資料(邊界矩形),這在需要將文字映射回圖像時相當關鍵。 + +```python +import aocr +import ai # hypothetical post‑processing module + +# Initialise the OCR engine +ocr_engine = aocr.Engine() + +# Request structured recognition (text + geometry) +ocr_engine.recognize_mode = aocr.RecognitionMode.Structured +``` + +> **為什麼這很重要:** +> 在預設模式下,引擎可能只給你一串連接的文字。結構化模式則提供頁面 → 行 → 詞的層級結構,且每個元素都有座標,使得在原始圖像上疊加結果或輸入到具版面感知的模型中變得更容易。 + +--- + +## 步驟 2:在圖像上執行 OCR 並取得原始結果 + +現在將圖像送入引擎。`recognize` 呼叫會回傳一個 `OcrResult` 物件,裡面包含多行,每行都有自己的邊界矩形。 + +```python +# Load your image (any format supported by aocr) +input_image_path = "input_image.png" + +# Run OCR – this returns an OcrResult with lines and bounds +raw_result = ocr_engine.recognize(input_image_path) +``` + +此時 `raw_result.lines` 內的物件具有兩個重要屬性: + +- `text` – 該行辨識出的字串。 +- `bounds` – 形如 `(x, y, width, height)` 的元組,描述該行的位置。 + +--- + +## 步驟 3:在保留幾何資訊的同時進行後處理 + +原始 OCR 輸出常常雜訊較多:零星字符、錯位空格或換行問題。`ai.run_postprocessor` 函式會清理文字,但 **保留原始幾何資訊** 完好,讓你仍擁有精確的座標。 + +```python +# Apply a post‑processing step that corrects common OCR errors +postprocessed_result = ai.run_postprocessor(raw_result) + +# The structure (lines + bounds) stays the same, only `line.text` changes +``` + +> **專業提示:** 若你有領域特定的詞彙(例如產品代碼),可將自訂字典提供給後處理器,以提升準確度。 + +--- + +## 步驟 4:提取帶座標的文字 – 迭代並顯示 + +最後,我們遍歷清理過的行,將每行的邊界框與文字一起列印。這就是 **extract text with coordinates** 的核心。 + +```python +# Print each recognised line together with its bounding box +for line in postprocessed_result.lines: + print(f"[{line.bounds}] {line.text}") +``` + +### 預期輸出 + +假設輸入圖像包含兩行文字:「Invoice #12345」與「Total: $89.99」,你會看到類似以下的輸出: + +``` +[(15, 30, 210, 25)] Invoice #12345 +[(15, 70, 190, 25)] Total: $89.99 +``` + +第一個元組是原始圖像上該行的 `(x, y, width, height)`,可用來繪製矩形、標示文字,或將座標傳入其他系統。 + +--- + +## 可視化結果(可選) + +如果想在圖像上疊加顯示邊界框,可以使用 
Pillow(PIL)繪製矩形。以下是一段快速示例;若只需要原始資料,可自行略過。 + +```python +from PIL import Image, ImageDraw + +# Open the original image +img = Image.open(input_image_path) +draw = ImageDraw.Draw(img) + +# Draw a rectangle around each line +for line in postprocessed_result.lines: + x, y, w, h = line.bounds + draw.rectangle([x, y, x + w, y + h], outline="red", width=2) + +# Save or show the annotated image +img.save("annotated_output.png") +img.show() +``` + +![在圖像上執行 OCR 示例 – 顯示邊界框](/images/ocr-bounding-boxes.png "在圖像上執行 OCR – 邊界框疊加") + +上述 alt 文字包含 **主要關鍵字**,符合圖片 alt 屬性的 SEO 要求。 + +--- + +## 為何結構化 OCR 辨識優於簡單文字提取 + +你可能會想,「我只要跑 OCR 拿到文字就好,何必在乎幾何資訊?」 + +- **空間上下文:** 當需要在表單上對應欄位(例如「Date」旁的日期值)時,座標告訴你資料的 *位置*。 +- **多欄位版面:** 簡單的線性文字會失去排序;結構化資料保留欄位順序。 +- **後處理精度:** 知道框的大小可協助判斷文字是標題、腳註,還是雜訊。 + +總而言之,**structured OCR recognition** 為你提供彈性,讓你能建構更智慧的流程——無論是寫入資料庫、產生可搜尋的 PDF,或訓練尊重版面的機器學習模型。 + +--- + +## 常見邊緣案例及處理方式 + +| 情境 | 需留意的地方 | 建議解決方式 | +|-----------|-------------------|---------------| +| **旋轉或傾斜的圖像** | 邊界框可能偏離軸線。 | 先以去斜處理(例如 OpenCV 的 `warpAffine`)作前處理。 | +| **字體過小** | 引擎可能遺漏字符,導致空行。 | 提升圖像解析度或使用 `ocr_engine.set_dpi(300)`。 | +| **混合語言** | 語言模型不符會產生亂碼。 | 在辨識前設定 `ocr_engine.language = ["en", "de"]`。 | +| **重疊的框** | 後處理器可能不小心合併兩行。 | 在處理後檢查 `line.bounds`;必要時調整 `ai.run_postprocessor` 的閾值。 | + +提前處理這些情況,可在日後擴展至每日數百份文件時,避免頭痛問題。 + +--- + +## 完整端到端腳本 + +以下是完整、可直接執行的程式碼,將所有步驟串接起來。複製貼上、調整圖像路徑,即可使用。 + +```python +# -*- coding: utf-8 -*- +""" +Run OCR on image – extract text with coordinates using structured OCR recognition. 
+Author: Your Name +Date: 2026-05-03 +""" + +import aocr +import ai +from PIL import Image, ImageDraw + +def run_structured_ocr(image_path: str, annotate: bool = False): + # 1️⃣ Initialise the OCR engine + ocr_engine = aocr.Engine() + ocr_engine.recognize_mode = aocr.RecognitionMode.Structured + + # 2️⃣ Recognise the image + raw_result = ocr_engine.recognize(image_path) + + # 3️⃣ Post‑process while keeping geometry + processed = ai.run_postprocessor(raw_result) + + # 4️⃣ Print each line with its bounding box + for line in processed.lines: + print(f"[{line.bounds}] {line.text}") + + # Optional visualisation + if annotate: + img = Image.open(image_path) + draw = ImageDraw.Draw(img) + for line in processed.lines: + x, y, w, h = line.bounds + draw.rectangle([x, y, x + w, y + h], outline="red", width=2) + annotated_path = "annotated_" + image_path + img.save(annotated_path) + print(f"Annotated image saved as {annotated_path}") + +if __name__ == "__main__": + INPUT_IMG = "input_image.png" + run_structured_ocr(INPUT_IMG, annotate=True) +``` + +執行此腳本將會: + +1. **Run OCR on image** 並使用結構化模式。 +2. 為每一行 **extract text with coordinates**。 +3. 可選地產生顯示框線的 PNG 標註圖。 + +--- + +## 結論 + +你現在擁有一套完整、獨立的解決方案,能夠 **run OCR on image** 並使用 **structured OCR recognition** **extract text with coordinates**。程式碼示範了從引擎初始化、後處理到視覺驗證的每一步,讓你可以將其套用於收據、表單或任何需要精確文字定位的視覺文件。 + +接下來可以嘗試將 `aocr` 引擎換成其他函式庫(如 Tesseract、EasyOCR),觀察它們的結構化輸出差異。也可以實驗不同的後處理策略,例如拼寫檢查或自訂正則表達式過濾,以提升特定領域的準確度。若你正在建構更大型的流水線,考慮將 `(text, bounds)` 配對存入資料庫,以供日後分析使用。 + +祝編程順利,願你的 OCR 專案永遠精準! 
+ +{{< /blocks/products/pf/tutorial-page-section >}} +{{< /blocks/products/pf/main-container >}} +{{< /blocks/products/pf/main-wrap-class >}} +{{< blocks/products/products-backtop-button >}} \ No newline at end of file diff --git a/ocr/hungarian/python/general/extract-text-from-image-ocr-with-aspose-ai-spell-check/_index.md b/ocr/hungarian/python/general/extract-text-from-image-ocr-with-aspose-ai-spell-check/_index.md new file mode 100644 index 000000000..54ebf8bef --- /dev/null +++ b/ocr/hungarian/python/general/extract-text-from-image-ocr-with-aspose-ai-spell-check/_index.md @@ -0,0 +1,212 @@ +--- +category: general +date: 2026-05-03 +description: Képről szöveg kinyerése Aspose OCR és AI helyesírás-ellenőrzés használatával. + Tanulja meg, hogyan kell OCR-t végezni képen, betölteni a képet OCR-hez, felismerni + a számla szövegét, és felszabadítani a GPU erőforrásokat. +draft: false +keywords: +- extract text from image +- how to ocr image +- load image for ocr +- release gpu resources +- recognize text from invoice +language: hu +og_description: Szöveg kinyerése képből az Aspose OCR és AI helyesírás-ellenőrző segítségével. + Lépésről lépésre útmutató, amely bemutatja, hogyan kell OCR-t végezni a képen, hogyan + töltsük be a képet OCR-hez, és hogyan szabadítsuk fel a GPU erőforrásokat. +og_title: Képből szöveg kinyerése – Teljes OCR és helyesírás-ellenőrző útmutató +tags: +- OCR +- Aspose +- AI +- Python +title: Szöveg kinyerése képből – OCR az Aspose AI helyesírás-ellenőrzővel +url: /hu/python/general/extract-text-from-image-ocr-with-aspose-ai-spell-check/ +--- + +{{< blocks/products/pf/main-wrap-class >}} +{{< blocks/products/pf/main-container >}} +{{< blocks/products/pf/tutorial-page-section >}} + +# szöveg kinyerése képből – Teljes OCR és helyesírás-ellenőrző útmutató + +Valaha szükséged volt **szöveg kinyerésére képből**, de nem tudtad, melyik könyvtár biztosítja a sebességet és a pontosságot? Nem vagy egyedül. 
Sok valós projektben – gondolj számlafeldolgozásra, nyugták digitalizálására vagy szerződések beolvasására – a tiszta, kereshető szöveg kinyerése a képből az első akadály. + +A jó hír, hogy az Aspose OCR egy könnyű Aspose AI modellel párosítva néhány Python sorral el tudja végezni ezt a feladatot. Ebben az útmutatóban végigvezetünk a **képek OCR‑elésének** lépésein, a kép helyes betöltésén, a beépített helyesírás-ellenőrző utófeldolgozó futtatásán, és végül a **GPU erőforrások felszabadításán**, hogy az alkalmazásod memória‑kímélő maradjon. + +A útmutató végére képes leszel **számlaképek szövegének felismerésére**, a gyakori OCR‑hibákat automatikusan javítani, és a GPU‑t tisztán tartani a következő köteghez. + +--- + +## Amire szükséged lesz + +- Python 3.9 vagy újabb (a kód típusjelöléseket használ, de korábbi 3.x verziókon is működik) +- `aspose-ocr` és `aspose-ai` csomagok (telepítés: `pip install aspose-ocr aspose-ai`) +- A CUDA‑t támogató GPU opcionális; ha nincs, a szkript CPU‑ra vált. +- Egy példakép, pl. `sample_invoice.png`, egy olyan mappában, amelyre hivatkozhatsz. + +Nincs nehéz ML keretrendszer, nincs hatalmas modell letöltés – csak egy kis Q4‑K‑M kvantált modell, amely kényelmesen elfér a legtöbb GPU‑n. + +## 1. lépés: OCR motor inicializálása – szöveg kinyerése képből + +Az első lépés egy `OcrEngine` példány létrehozása, és a várt nyelv megadása. Itt az angolt választjuk, és egyszerű szöveg (plain‑text) kimenetet kérünk, ami ideális a további feldolgozáshoz. + +```python +import aocr # Aspose OCR package +import aspose.ai as ai # Aspose AI package + +# Initialise the OCR engine +ocr_engine = aocr.OcrEngine() +ocr_engine.language = aocr.Language.English # Choose any supported language +ocr_engine.recognize_mode = aocr.RecognitionMode.Plain # Plain text makes post‑processing easier +``` + +**Miért fontos:** A nyelv beállítása szűkíti a karakterkészletet, ezáltal javítva a pontosságot. 
Az egyszerű szöveg mód eltávolítja a formázási információkat, amelyeket általában nem használsz, ha csak szöveget akarsz kinyerni a képből. + +## 2. lépés: Kép betöltése OCR‑hez – hogyan OCR‑eljük a képet + +Most egy tényleges képet adunk a motorhoz. A `Image.load` segédfüggvény ismeri a gyakori formátumokat (PNG, JPEG, TIFF), és elrejti a fájl‑IO sajátosságait. + +```python +# Load the input image – this is the "load image for OCR" step +input_image = aocr.Image.load("YOUR_DIRECTORY/sample_invoice.png") +raw_text = ocr_engine.recognize(input_image) # Returns the recognised text as a string +``` + +**Tipp:** Ha a forrásképek nagyok, fontold meg azok átméretezését, mielőtt a motorhoz küldenéd; a kisebb méretek csökkenthetik a GPU memóriahasználatot anélkül, hogy rontanák a felismerés minőségét. + +## 3. lépés: Aspose AI modell konfigurálása – szöveg felismerése számláról + +Az Aspose AI egy apró GGUF modellt tartalmaz, amelyet automatikusan letölthetsz. A példa a `Qwen2.5‑3B‑Instruct‑GGUF` tárolót használja, `q4_k_m` kvantálással. A futtatókörnyezetnek azt is megmondjuk, hogy a GPU‑ra 20 réteget foglaljon, ami egyensúlyt teremt a sebesség és a VRAM használat között. + +```python +# Model configuration – auto‑download a small Q4‑K‑M quantised model +model_config = ai.AsposeAIModelConfig() +model_config.allow_auto_download = "true" +model_config.hugging_face_repo_id = "bartowski/Qwen2.5-3B-Instruct-GGUF" +model_config.hugging_face_quantization = "q4_k_m" +model_config.gpu_layers = 20 # Use 20 GPU layers when a GPU is available +``` + +**A háttérben:** A kvantált modell körülbelül 1,5 GB helyet foglal a lemezen, ami csak töredéke egy teljes pontosságú modellnek, de még így is elegendő nyelvi finomságot tartalmaz ahhoz, hogy a tipikus OCR‑helyesírási hibákat felismerje. + +## 4. lépés: AsposeAI inicializálása és a helyesírás-ellenőrző utófeldolgozó csatolása + +Az Aspose AI tartalmaz egy kész helyesírás-ellenőrző utófeldolgozót. 
Ha csatolod, minden OCR eredmény automatikusan megtisztul. + +```python +# Initialise AsposeAI and attach the built‑in spell‑check post‑processor +ocr_ai = ai.AsposeAI(model_config) # Pass the config we just built +ocr_ai.set_post_processor(ocr_ai.postprocessor_spell_check, {}) # Empty dict → default settings +``` + +**Miért használjuk az utófeldolgozót?** Az OCR motorok gyakran félreolvassák az „Invoice” szót „Invo1ce”‑ként vagy a „Total” szót „T0tal”‑ként. A helyesírás-ellenőrző egy könnyű nyelvi modellt futtat a nyers szövegen, és kijavítja ezeket a hibákat anélkül, hogy saját szótárat kellene írnod. + +## 5. lépés: Helyesírás-ellenőrző utófeldolgozó futtatása az OCR eredményen + +Miután minden összekapcsolódott, egyetlen hívás adja vissza a javított szöveget. Ki is nyomtatjuk az eredeti és a tisztított változatot, hogy lásd a javulást. + +```python +# Run the spell‑check post‑processor on the OCR result +corrected_text = ocr_ai.run_postprocessor(raw_text) + +print("Original :", raw_text) +print("Corrected:", corrected_text) +``` + +Egy számla tipikus kimenete így nézhet ki: + +``` +Original : Invo1ce #12345 +Date: 2023/07/15 +Total: $1,250.00 +... +Corrected: Invoice #12345 +Date: 2023/07/15 +Total: $1,250.00 +... +``` + +Vedd észre, hogy a „Invo1ce” helyes szóra, az „Invoice”‑ra változott. Ez a beépített AI helyesírás-ellenőrző ereje. + +## 6. lépés: GPU erőforrások felszabadítása – GPU erőforrások biztonságos felszabadítása + +Ha ezt egy hosszú életű szolgáltatásban futtatod (pl. egy web‑API, amely percenként tucatnyi számlát dolgoz fel), minden köteg után fel kell szabadítanod a GPU kontextust. Ellenkező esetben memória‑szivárgásokat és végül a „CUDA out of memory” hibákat fogod tapasztalni. + +```python +# Release GPU resources – crucial to avoid memory leaks +ocr_ai.free_resources() +``` + +**Pro tipp:** Hívd meg a `free_resources()`‑t egy `finally` blokkban vagy egy kontextuskezelőben, hogy mindig végrehajtódjon, még akkor is, ha kivétel keletkezik. 
+ +## Teljes működő példa + +Az összes részegység összerakásával egy önálló szkriptet kapsz, amelyet bármely projektbe beilleszthetsz. + +```python +# extract_text_from_image.py +import aocr +import aspose.ai as ai + +def main(): + # Step 1: Initialise OCR engine + ocr_engine = aocr.OcrEngine() + ocr_engine.language = aocr.Language.English + ocr_engine.recognize_mode = aocr.RecognitionMode.Plain + + # Step 2: Load image for OCR + input_image = aocr.Image.load("YOUR_DIRECTORY/sample_invoice.png") + raw_text = ocr_engine.recognize(input_image) + + # Step 3: Configure Aspose AI model + model_cfg = ai.AsposeAIModelConfig() + model_cfg.allow_auto_download = "true" + model_cfg.hugging_face_repo_id = "bartowski/Qwen2.5-3B-Instruct-GGUF" + model_cfg.hugging_face_quantization = "q4_k_m" + model_cfg.gpu_layers = 20 + + # Step 4: Initialise AI and attach spell‑check + ocr_ai = ai.AsposeAI(model_cfg) + ocr_ai.set_post_processor(ocr_ai.postprocessor_spell_check, {}) + + # Step 5: Run spell‑check + corrected_text = ocr_ai.run_postprocessor(raw_text) + + print("Original :", raw_text) + print("Corrected:", corrected_text) + + # Step 6: Release GPU resources + ocr_ai.free_resources() + +if __name__ == "__main__": + main() +``` + +Mentsd el a fájlt, állítsd be a képed elérési útját, és futtasd a `python extract_text_from_image.py` parancsot. A konzolon meg kell jelennie a tisztított számlaszövegnek. + +## Gyakran Ismételt Kérdések (GYIK) + +**K: Működik ez csak CPU‑s gépeken?** +V: Teljesen. Ha nincs GPU észlelve, az Aspose AI CPU‑ra vált, bár lassabb lesz. A CPU kényszerítéséhez állítsd be a `model_cfg.gpu_layers = 0` értéket. + +**K: Mi van, ha a számláim nem angol nyelvűek?** +V: Állítsd a `ocr_engine.language`‑t a megfelelő enum értékre (pl. `aocr.Language.Spanish`). A helyesírás-ellenőrző modell többnyelvű, de nyelvspecifikus modellel jobb eredményeket érhetsz el. + +**K: Feldolgozhatok több képet egy ciklusban?** +V: Igen. 
Csak helyezd a betöltési, felismerési és utófeldolgozási lépéseket egy `for` ciklusba. Ne felejtsd el meghívni a `ocr_ai.free_resources()`‑t a ciklus után vagy minden köteg után, ha ugyanazt az AI példányt használod újra. + +**K: Mekkora a modell letöltése?** +V: Körülbelül 1,5 GB a kvantált `q4_k_m` verzióhoz. Az első futtatás után gyorsítótárazódik, így a későbbi indítások azonnaliak. + +## Összegzés + +Ebben az útmutatóban bemutattuk, hogyan **nyerjünk ki szöveget képből** az Aspose OCR használatával, hogyan konfiguráljunk egy apró AI modellt, alkalmazzunk egy helyesírás-ellenőrző utófeldolgozót, és biztonságosan **felszabadítsuk a GPU erőforrásokat**. A munkafolyamat mindent lefed a kép betöltésétől a saját takarításig, megbízható csővezetéket biztosítva a **számlák szövegének felismerése** esetekhez. + +Következő lépések? Próbáld megcserélni a helyesírás-ellenőrzőt egy egyedi entitás‑kivonó modellre + +{{< /blocks/products/pf/tutorial-page-section >}} +{{< /blocks/products/pf/main-container >}} +{{< /blocks/products/pf/main-wrap-class >}} +{{< blocks/products/products-backtop-button >}} \ No newline at end of file diff --git a/ocr/hungarian/python/general/how-to-batch-ocr-with-aspose-ocr-full-python-guide/_index.md b/ocr/hungarian/python/general/how-to-batch-ocr-with-aspose-ocr-full-python-guide/_index.md new file mode 100644 index 000000000..945b17079 --- /dev/null +++ b/ocr/hungarian/python/general/how-to-batch-ocr-with-aspose-ocr-full-python-guide/_index.md @@ -0,0 +1,203 @@ +--- +category: general +date: 2026-05-03 +description: Hogyan végezzünk kötegelt OCR-t képeken az Aspose OCR és az AI helyesírás-ellenőrzés + segítségével. Tanulja meg, hogyan nyerjen ki szöveget a képekből, alkalmazzon helyesírás-ellenőrzést, + használjon ingyenes AI erőforrásokat, és javítsa az OCR hibákat. 
+draft: false +keywords: +- how to batch ocr +- extract text from images +- free ai resources +- apply spell check +- correct ocr errors +language: hu +og_description: Hogyan végezzünk tömeges OCR-t képeken az Aspose OCR és az AI helyesírás-ellenőrzés + segítségével. Kövesd a lépésről‑lépésre útmutatót a képek szövegének kinyeréséhez, + a helyesírás-ellenőrzés alkalmazásához, az AI erőforrások ingyenes használatához + és az OCR hibák javításához. +og_title: Hogyan végezzünk kötegelt OCR-t az Aspose OCR-rel – Teljes Python útmutató +tags: +- OCR +- Python +- AI +- Aspose +title: Hogyan végezzünk kötegelt OCR-t az Aspose OCR segítségével – Teljes Python + útmutató +url: /hu/python/general/how-to-batch-ocr-with-aspose-ocr-full-python-guide/ +--- + +{{< blocks/products/pf/main-wrap-class >}} +{{< blocks/products/pf/main-container >}} +{{< blocks/products/pf/tutorial-page-section >}} + +# Hogyan végezzünk kötegelt OCR-t az Aspose OCR-rel – Teljes Python útmutató + +Valaha is elgondolkodtál már azon, **hogyan végezzünk kötegelt OCR-t** egy teljes mappában lévő beolvasott PDF-eken vagy fényképeken anélkül, hogy minden fájlhoz külön szkriptet írnál? Nem vagy egyedül. Sok valós világú folyamatban szükség van **szöveg kinyerésére képekből**, a helyesírási hibák javítására, és végül a felhasznált AI erőforrások felszabadítására. Ez az útmutató pontosan megmutatja, hogyan lehet ezt megtenni az Aspose OCR-rel, egy könnyű AI post‑processzorral, és néhány Python sorral. + +Végigvezetünk az OCR motor inicializálásán, egy AI helyesírás-ellenőrző csatlakoztatásán, a képek könyvtárán való iteráláson, és a modell tisztításán. A végére egy kész‑használatra szánt szkriptet kapsz, amely **automatikusan javítja az OCR hibákat** és felszabadítja a **szabad AI erőforrásokat**, így a GPU-d is boldog marad. 
+ +## Amire szükséged lesz + +- Python 3.9+ (a kód típusjelöléseket használ, de korábbi 3.x verziókon is működik) +- `asposeocr` csomag (`pip install asposeocr`) – ez biztosítja az OCR motorját. +- Hozzáférés a Hugging Face modellhez `bartowski/Qwen2.5-3B-Instruct-GGUF` (automatikusan letöltődik). +- GPU, amely legalább néhány GB VRAM-mal rendelkezik (a szkript `gpu_layers = 30`‑at állít be, ha szükséges, csökkenthető). + +Nincs külső szolgáltatás, nincs fizetős API – minden helyben fut. + +--- + +## 1. lépés: Az OCR motor beállítása – **Hogyan végezzünk kötegelt OCR-t** hatékonyan + +Mielőtt ezer képet feldolgozhatnánk, szükségünk van egy stabil OCR motorra. Az Aspose OCR lehetővé teszi, hogy egyetlen hívással válasszunk nyelvet és felismerési módot. + +```python +# Step 1: Initialize the OCR engine for English plain‑text output +def init_ocr() -> aocr.OcrEngine: + ocr_engine = aocr.OcrEngine() + ocr_engine.language = aocr.Language.English # English language pack + ocr_engine.recognize_mode = aocr.RecognitionMode.Plain # Returns raw string, no layout + return ocr_engine +``` + +**Miért fontos:** A `recognize_mode` `Plain`‑ra állítása könnyűsúlyú kimenetet eredményez, ami ideális, ha később helyesírás-ellenőrzést tervezel. Ha elrendezési információra lenne szükséged, `Layout`‑ra váltanál, de ez plusz terhelést jelent, amit valószínűleg nem akarsz egy kötegelt feladatban. + +> **Pro tipp:** Ha többnyelvű beolvasásokat kezelsz, átadhatsz egy listát, például `ocr_engine.language = [aocr.Language.English, aocr.Language.Spanish]`. + +## 2. lépés: Az AI post‑processzor inicializálása – **Alkalmazd a helyesírás-ellenőrzést** az OCR kimenetre + +Az Aspose AI egy beépített post‑processzorral érkezik, amely bármilyen modellt képes futtatni. Itt egy kvantált Qwen 2.5 modellt töltünk le a Hugging Face‑ről, és csatlakoztatjuk a helyesírás-ellenőrző rutint. 
+ +```python +# Step 2: Configure and start the AI post‑processor +def init_ai() -> aocr.ai.AsposeAI: + model_cfg = AsposeAIModelConfig() + model_cfg.allow_auto_download = "true" + model_cfg.hugging_face_repo_id = "bartowski/Qwen2.5-3B-Instruct-GGUF" + model_cfg.hugging_face_quantization = "q4_k_m" + model_cfg.gpu_layers = 30 # Adjust based on your GPU memory + ai_processor = AsposeAI() + ai_processor.initialize(model_cfg) + + # Attach the built‑in spell‑check post‑processor + ai_processor.set_post_processor(ai_processor.postprocessor_spell_check, {}) + return ai_processor +``` + +**Miért fontos:** A modell kvantált (`q4_k_m`), ami jelentősen csökkenti a memóriahasználatot, miközben még mindig megfelelő nyelvi megértést biztosít. A `set_post_processor` hívásával azt mondjuk az Aspose AI‑nek, hogy automatikusan futtassa a **helyesírás-ellenőrzés alkalmazása** lépést minden általa kapott karakterláncon. + +> **Figyelem:** Ha a GPU-d nem képes 30 réteget kezelni, csökkentsd a számot 15‑re vagy akár 5‑re – a szkript még mindig működni fog, csak valamivel lassabban. + +## 3. lépés: OCR futtatása és **OCR hibák javítása** egyetlen képen + +Most, hogy az OCR motor és az AI helyesírás-ellenőrző is készen áll, kombináljuk őket. Ez a függvény betölt egy képet, kinyeri a nyers szöveget, majd az AI post‑processzort futtatja, hogy megtisztítsa azt. + +```python +# Step 3: OCR an image and run the spell‑check post‑processor +def ocr_and_correct(image_path: str, + ocr_engine: aocr.OcrEngine, + ai_processor: aocr.ai.AsposeAI) -> str: + image = aocr.Image.load(image_path) # Load any supported format + raw_text = ocr_engine.recognize(image) # Plain string from OCR + corrected_text = ai_processor.run_postprocessor(raw_text) + return corrected_text +``` + +**Miért fontos:** A nyers OCR karakterlánc közvetlenül az AI modellbe való betáplálása egy **OCR hibák javítása** lépést biztosít, anélkül, hogy reguláris kifejezéseket vagy egyedi szótárakat írnánk. 
A modell érti a kontextust, így javíthatja a “recieve” → “receive” hibát és még finomabb hibákat is. + +## 4. lépés: **Szöveg kinyerése képekből** tömegesen – A valódi kötegelt ciklus + +Itt jön a **hogyan végezzünk kötegelt OCR-t** varázslata. Egy könyvtáron iterálunk, kihagyjuk a nem támogatott fájlokat, és minden javított kimenetet egy `.txt` fájlba írunk. + +```python +# Step 4: Process an entire folder of images +if __name__ == "__main__": + # Initialize once – reuse for every file + ocr_engine = init_ocr() + ai_processor = init_ai() + + input_dir = "YOUR_DIRECTORY/input_images" + output_dir = "YOUR_DIRECTORY/output_text" + os.makedirs(output_dir, exist_ok=True) + + for file_name in os.listdir(input_dir): + # Only handle common image extensions + if not file_name.lower().endswith(('.png', '.jpg', '.jpeg', '.tif', '.tiff')): + continue + + image_path = os.path.join(input_dir, file_name) + corrected = ocr_and_correct(image_path, ocr_engine, ai_processor) + + txt_path = os.path.join(output_dir, + os.path.splitext(file_name)[0] + ".txt") + with open(txt_path, "w", encoding="utf-8") as txt_file: + txt_file.write(corrected) + + print(f"Processed {file_name}") + + # Step 5: Release **free AI resources** after the batch finishes + ai_processor.free_resources() +``` + +### Várt kimenet + +Egy olyan képnél, amely a *„The quick brown fox jumps over the lazzy dog.”* mondatot tartalmazza, egy szövegfájlt fogsz látni, amely: + +``` +The quick brown fox jumps over the lazy dog. +``` + +Vedd észre, hogy a dupla „z” automatikusan javítva lett – ez az AI helyesírás-ellenőrzés működését mutatja. + +**Miért fontos:** Az OCR és AI objektumok **egyszer** történő létrehozásával és újrahasználatával elkerüljük a modell minden egyes fájlra történő betöltésének terhelését. Ez a leghatékonyabb módja a **kötegelt OCR** skálázásának. + +## 5. 
lépés: Tisztítás – **AI erőforrások felszabadítása** megfelelően + +Amikor befejezted, a `free_resources()` hívása felszabadítja a GPU memóriát, a CUDA kontextusokat és a modell által létrehozott ideiglenes fájlokat. + +```python +# Step 5: Explicitly free GPU and model memory +ai_processor.free_resources() +``` + +Ennek a lépésnek a kihagyása függőben lévő GPU lefoglalásokat hagyhat maga után, amelyek összeomlaszthatják a későbbi Python folyamatokat vagy VRAM-ot fogyaszthatnak. Tekintsd ezt a kötegelt feladat „lámpák kikapcsolása” részének. + +## Gyakori buktatók és extra tippek + +| Probléma | Mire figyelj | Javítás | +|----------|--------------|--------| +| **Memóriahiány hibák** | A GPU néhány tucat kép után kifogy | Csökkentsd a `gpu_layers` értékét vagy válts CPU-ra (`model_cfg.gpu_layers = 0`). | +| **Hiányzó nyelvi csomag** | Az OCR üres karakterláncokat ad vissza | Győződj meg arról, hogy az `asposeocr` verzió tartalmazza az angol nyelvi adatokat; szükség esetén telepítsd újra. | +| **Nem képfájlok** | A szkript összeomlik egy eltévedt `.pdf` fájlon | Az `if not file_name.lower().endswith(...)` feltétel már kihagyja őket. | +| **A helyesírás-ellenőrzés nem lett alkalmazva** | A kimenet azonosnak tűnik a nyers OCR-rel | Ellenőrizd, hogy a `ai_processor.set_post_processor` a ciklus előtt lett-e meghívva. | +| **Lassú kötegelt sebesség** | Képenként >5 másodpercet vesz igénybe | Állítsd be a `model_cfg.allow_auto_download = "false"` értéket az első futtatás után, így a modell nem töltődik le minden alkalommal. | + +**Pro tipp:** Ha **szöveget kell kinyerni képekből** angolon kívül más nyelven, egyszerűen változtasd meg az `ocr_engine.language` értékét a megfelelő enumra (pl. `aocr.Language.French`). ugyanaz a AI post‑processzor továbbra is alkalmazni fogja a helyesírás-ellenőrzést, de a legjobb eredményért érdemes nyelvspecifikus modellt használni. 
+ +## Összefoglalás és következő lépések + +Áttekintettük a teljes folyamatot a **kötegelt OCR** végrehajtásához: + +1. **Inicializáld** egy egyszerű szöveges OCR motort angol nyelvre. +2. **Konfiguráld** egy AI helyesírás-ellenőrző modellt és kösd azt post‑processzorként. +3. **Futtasd** az OCR-t minden képen, és hagyd, hogy az AI **automatikusan javítsa az OCR hibákat**. +4. **Iterálj** egy könyvtáron, hogy **szöveget nyerj ki képekből** tömegesen. +5. **Felszabadítsd** az AI erőforrásokat, amint a feladat befejeződik. + +Innen tovább: + +- A javított szöveget továbbíthatod egy downstream NLP folyamatba (érzelem elemzés, entitás kinyerés, stb.). +- A helyesírás-ellenőrző post‑processzort kicserélheted egy egyedi összefoglalóra a `ai_processor.set_post_processor(your_custom_func, {})` hívásával. +- Párhuzamosíthatod a mappa ciklust a `concurrent.futures.ThreadPoolExecutor`‑rel, ha a GPU több adatfolyamot is képes kezelni. + +## Záró gondolatok + +A OCR kötegelt feldolgozása nem kell, hogy munka legyen. Az Aspose OCR és egy könnyű AI modell kombinálásával egy **egyetlen megoldást** kapsz, amely **szöveget nyer ki képekből**, **alkalmazza a helyesírás-ellenőrzést**, **javítja az OCR hibákat**, és **tisztán felszabadítja az AI erőforrásokat**. Próbáld ki a szkriptet egy tesztmappán, állítsd be a GPU rétegszámot a hardveredhez, és percek alatt egy produkcióra kész folyamatod lesz. + +Van kérdésed a modell finomhangolásával, PDF-ek kezelésével vagy a webszolgáltatásba való integrálással kapcsolatban? Hagyj egy megjegyzést alább vagy írj nekem a GitHub‑on. Boldog kódolást, és legyen az OCR-ed mindig pontos! 
+ +{{< /blocks/products/pf/tutorial-page-section >}} +{{< /blocks/products/pf/main-container >}} +{{< /blocks/products/pf/main-wrap-class >}} +{{< blocks/products/products-backtop-button >}} \ No newline at end of file diff --git a/ocr/hungarian/python/general/python-ocr-tutorial-batch-ocr-processing-made-easy/_index.md b/ocr/hungarian/python/general/python-ocr-tutorial-batch-ocr-processing-made-easy/_index.md new file mode 100644 index 000000000..296cd5e09 --- /dev/null +++ b/ocr/hungarian/python/general/python-ocr-tutorial-batch-ocr-processing-made-easy/_index.md @@ -0,0 +1,300 @@ +--- +category: general +date: 2026-05-03 +description: Python OCR oktató, amely bemutatja, hogyan lehet PNG képfájlokat betölteni, + szöveget felismerni a képről, és ingyenes AI erőforrásokat használni kötegelt OCR + feldolgozáshoz. +draft: false +keywords: +- python ocr tutorial +- batch ocr processing +- free ai resources +- load png image +- recognize text from image +language: hu +og_description: A Python OCR oktatóanyag végigvezet a PNG képek betöltésén, a képről + történő szövegfelismerésen és az ingyenes AI erőforrások kezelésén a kötegelt OCR + feldolgozáshoz. +og_title: Python OCR oktatóanyag – Gyors kötegelt OCR ingyenes AI erőforrásokkal +tags: +- OCR +- Python +- AI +title: Python OCR útmutató – Kötegelt OCR feldolgozás egyszerűen +url: /hu/python/general/python-ocr-tutorial-batch-ocr-processing-made-easy/ +--- + +{{< blocks/products/pf/main-wrap-class >}} +{{< blocks/products/pf/main-container >}} +{{< blocks/products/pf/tutorial-page-section >}} + +# Python OCR Bemutató – Kötetes OCR Feldolgozás Egyszerűen + +Valaha is szükséged volt egy **python ocr tutorial**-ra, ami tényleg lehetővé teszi, hogy tucatnyi PNG fájlon futtass OCR-t anélkül, hogy a hajadba foglálkossz? Nem vagy egyedül. Sok valós projektben **load png image** fájlokat kell betölteni, egy motorba táplálni, majd a munka befejezésekor felszabadítani az AI erőforrásokat. 
+ +Ebben az útmutatóban egy komplett, azonnal futtatható példán keresztül mutatjuk be, hogyan **recognize text from image** fájlokból, hogyan dolgozzuk fel őket kötegben, és hogyan szabadítsuk fel a háttérben lévő AI memóriát. A végére egy önálló szkriptet kapsz, amelyet bármelyik projektbe beilleszthetsz – felesleges kiegészítők nélkül, csak a lényeg. + +## Amit Szükséged Van + +- Python 3.10 vagy újabb (a használt szintaxis f‑stringeket és típusjelöléseket igényel) +- Egy OCR könyvtár, amely rendelkezik `engine.recognize` metódussal – a bemutatóhoz egy fiktív `aocr` csomagot feltételezünk, de helyettesítheted Tesseracttal, EasyOCR‑ral stb. +- A kódrészletben látható `ai` segédmodul (kezeli a modell inicializálását és az erőforrások tisztítását) +- Egy mappa tele PNG fájlokkal, amelyeket feldolgozni szeretnél + +Ha nincs `aocr` vagy `ai` telepítve, helyettesítheted őket stubokkal – lásd a „Optional Stubs” szekciót a végén. + +## 1. lépés: Az AI Motor Inicializálása (Free AI Resources) + +Mielőtt bármilyen képet betáplálnál az OCR csővezetékbe, a háttérben lévő modellnek készen kell állnia. Az egyszeri inicializálás memóriát takarít meg és felgyorsítja a kötegelt feladatokat. + +```python +# step_1_initialize.py +import ai # hypothetical helper that wraps the AI model +import aocr # OCR library + +def init_engine(config_path: str = "config.yaml"): + """ + Initialize the AI engine if it hasn't been set up yet. + This uses free AI resources – the engine will be released later. + """ + if not ai.is_initialized(): + ai.initialize(config_path) # auto‑initialize with the provided configuration + else: + print("Engine already initialized.") +``` + +**Miért fontos:** +Ha minden képnél újra és újra meghívod az `ai.initialize`‑t, a GPU memória folyamatosan újra lefoglalódik, ami végül összeomlasztja a szkriptet. Az `ai.is_initialized()` ellenőrzésével biztosítjuk az egyszeri lefoglalást – ez a „free AI resources” elv. + +## 2. 
lépés: PNG Képfájlok Betöltése a Kötetes OCR Feldolgozáshoz + +Most összegyűjtjük az összes PNG fájlt, amelyet OCR‑en szeretnénk futtatni. A `pathlib` használata platform‑független kódot eredményez. + +```python +# step_2_load_images.py +from pathlib import Path +from typing import List + +def collect_png_paths(directory: str) -> List[Path]: + """ + Scan `directory` and return a list of Path objects pointing to PNG files. + """ + base_path = Path(directory) + if not base_path.is_dir(): + raise NotADirectoryError(f"'{directory}' is not a valid folder.") + + png_files = sorted(base_path.glob("*.png")) + if not png_files: + raise FileNotFoundError("No PNG images found in the specified directory.") + + print(f"Found {len(png_files)} PNG image(s) to process.") + return png_files +``` + +**Szélső eset:** +Ha a mappában nem‑PNG fájlok (pl. JPEG‑ek) is vannak, azok figyelmen kívül maradnak, így az `engine.recognize` nem akad el egy nem támogatott formátumnál. + +## 3. lépés: OCR Futása Minden Képen és Utófeldolgozás Alkalmazása + +Miután a motor készen áll és a fájllista elkészült, végigiterálhatunk a képeken, kinyerhetjük a nyers szöveget, és átadhatjuk egy utófeldolgozónak, amely megtisztítja a gyakori OCR hibákat (például felesleges sortöréseket). + +```python +# step_3_ocr_batch.py +import aocr +import ai +from pathlib import Path +from typing import List + +def ocr_batch(image_paths: List[Path]) -> List[str]: + """ + Perform OCR on each PNG image and return a list of cleaned strings. 
+ """ + results = [] + for image_path in image_paths: + # Load the image – aocr.Image.load abstracts away Pillow/OpenCV details + img = aocr.Image.load(str(image_path)) + + # Recognize raw text + raw_text = engine.recognize(img) + + # Refine the raw OCR output using the AI post‑processor + cleaned_text = ai.run_postprocessor(raw_text) + results.append(cleaned_text) + + print(f"Processed {image_path.name}: {len(cleaned_text)} characters extracted.") + + return results +``` + +**Miért választjuk szét a betöltést a felismeréstől:** +Az `aocr.Image.load` esetleg lazy dekódolást végez, ami nagy kötegek esetén gyorsabb. A betöltési lépés explicit megadása egyszerűvé teszi egy másik képkönyvtárra való átállást, ha később JPEG‑et vagy TIFF‑et kell kezelni. + +## 4. lépés: Tisztítás – AI Erőforrások Felszabadítása a Köteg Után + +A köteg befejezése után fel kell szabadítanunk a modellt, hogy elkerüljük a memória‑szivárgásokat, különösen GPU‑val rendelkező gépeken. + +```python +# step_4_cleanup.py +import ai + +def release_resources(): + """ + Free any allocated AI resources. Safe to call multiple times. + """ + if ai.is_initialized(): + ai.free_resources() + print("AI resources have been released.") + else: + print("No AI resources were allocated.") +``` + +## Az Egész Egyben – A Teljes Szkript + +Az alábbi egyetlen fájl összefűzi a négy lépést egy koherens munkafolyamatba. Mentsd `batch_ocr.py` néven, és futtasd a parancssorból. + +```python +# batch_ocr.py +""" +Python OCR tutorial – end‑to‑end batch OCR processing. +Loads PNG images, runs OCR, post‑processes results, and frees AI resources. 
+""" + +import sys +from pathlib import Path +import ai +import aocr + +# ---------------------------------------------------------------------- +# Helper functions (copied from the steps above) +# ---------------------------------------------------------------------- +def init_engine(cfg: str = "config.yaml"): + if not ai.is_initialized(): + ai.initialize(cfg) + else: + print("Engine already initialized.") + +def collect_png_paths(directory: str): + base_path = Path(directory) + if not base_path.is_dir(): + raise NotADirectoryError(f"'{directory}' is not a valid folder.") + png_files = sorted(base_path.glob("*.png")) + if not png_files: + raise FileNotFoundError("No PNG images found in the specified directory.") + print(f"Found {len(png_files)} PNG image(s) to process.") + return png_files + +def ocr_batch(image_paths): + results = [] + for image_path in image_paths: + img = aocr.Image.load(str(image_path)) + raw_text = engine.recognize(img) + cleaned_text = ai.run_postprocessor(raw_text) + results.append(cleaned_text) + print(f"Processed {image_path.name}: {len(cleaned_text)} characters.") + return results + +def release_resources(): + if ai.is_initialized(): + ai.free_resources() + print("AI resources released.") + else: + print("No resources to release.") + +# ---------------------------------------------------------------------- +# Main execution block +# ---------------------------------------------------------------------- +if __name__ == "__main__": + if len(sys.argv) != 2: + print("Usage: python batch_ocr.py ") + sys.exit(1) + + image_dir = sys.argv[1] + + try: + init_engine() + png_paths = collect_png_paths(image_dir) + texts = ocr_batch(png_paths) + + # Optional: write results to a single text file + output_file = Path("ocr_results.txt") + with output_file.open("w", encoding="utf-8") as f: + for path, txt in zip(png_paths, texts): + f.write(f"--- {path.name} ---\n") + f.write(txt + "\n\n") + print(f"All results saved to {output_file.resolve()}") + 
finally: + release_resources() +``` + +### Várható Kimenet + +A szkript futtatása egy három PNG‑t tartalmazó mappán például a következőt írhatja ki: + +``` +Engine already initialized. +Found 3 PNG image(s) to process. +Processed invoice1.png: 452 characters. +Processed receipt2.png: 317 characters. +Processed flyer3.png: 689 characters. +All results saved to /home/user/ocr_results.txt +AI resources released. +``` + +Az `ocr_results.txt` fájl minden képnél egy egyértelmű elválasztót és a megtisztított OCR‑szöveget tartalmazza. + +## Optional Stubs aocr & ai számára (Ha Nincsenek Valódi Csomagok) + +Ha csak a folyamatot szeretnéd tesztelni anélkül, hogy nehéz OCR könyvtárakat telepítenél, létrehozhatsz minimális mock modulokat: + +```python +# aocr/__init__.py +class Image: + @staticmethod + def load(path): + return f"ImageObject({path})" + +def dummy_recognize(image): + return "Raw OCR output for " + str(image) + +engine = type("Engine", (), {"recognize": dummy_recognize})() +``` + +```python +# ai/__init__.py +_state = {"initialized": False} + +def is_initialized(): + return _state["initialized"] + +def initialize(cfg): + print(f"Initializing AI engine with {cfg}") + _state["initialized"] = True + +def run_postprocessor(text): + # Very naive cleanup: strip extra spaces + return " ".join(text.split()) + +def free_resources(): + print("Freeing AI resources") + _state["initialized"] = False +``` + +Helyezd ezeket a mappákat a `batch_ocr.py` mellé, és a szkript mock eredményeket fog nyomtatni. + +## Pro Tippek & Gyakori Hibák + +- **Memória‑csúcsok:** Ha több ezer nagy felbontású PNG‑t dolgozol fel, fontold meg a képek átméretezését OCR előtt. Az `aocr.Image.load` gyakran elfogad egy `max_size` paramétert. +- **Unicode kezelés:** Mindig nyisd meg a kimeneti fájlt `encoding="utf-8"`‑vel; az OCR motorok nem‑ASCII karaktereket is kiadhatnak. +- **Párhuzamosság:** CPU‑korlátú OCR esetén a `ocr_batch`‑ot be lehet csomagolni egy `concurrent.futures.ThreadPoolExecutor`‑be. 
Ne feledd, hogy egyetlen `ai` példányt kell használni – ha sok szál mindegyikben meghívja az `ai.initialize`‑t, az ellentétes a „free AI resources” céllal. +- **Hibamentesség:** Tekerj egy `try/except` blokkba a képenkénti ciklust, hogy egyetlen sérült PNG ne állítsa le az egész köteget. + +## Összegzés + +Most már rendelkezel egy **python ocr tutorial**‑ral, amely bemutatja, hogyan **load png image** fájlokat, hogyan végezz **batch OCR processing**‑t, és hogyan kezeld felelősen a **free AI resources**‑t. A komplett, futtatható példa pontosan megmutatja, hogyan **recognize text from image** objektumokból, majd tisztítja fel a memóriát, így egyszerűen beillesztheted saját projektjeidbe hiányzó részek keresése nélkül. + +Készen állsz a következő lépésre? Próbáld ki a stub‑olt `aocr` és `ai` modulok helyettesítését valódi könyvtárakkal, például `pytesseract`‑tal és `torchvision`‑nal. Bővítheted a szkriptet JSON‑kimenettel, adatbázisba írással vagy felhő‑tároló bucketbe való integrálással. A lehetőségek végtelenek – jó kódolást! + +{{< /blocks/products/pf/tutorial-page-section >}} +{{< /blocks/products/pf/main-container >}} +{{< /blocks/products/pf/main-wrap-class >}} +{{< blocks/products/products-backtop-button >}} \ No newline at end of file diff --git a/ocr/hungarian/python/general/run-ocr-on-image-complete-guide-to-structured-text-extractio/_index.md b/ocr/hungarian/python/general/run-ocr-on-image-complete-guide-to-structured-text-extractio/_index.md new file mode 100644 index 000000000..5b86ab749 --- /dev/null +++ b/ocr/hungarian/python/general/run-ocr-on-image-complete-guide-to-structured-text-extractio/_index.md @@ -0,0 +1,255 @@ +--- +category: general +date: 2026-05-03 +description: Tanulja meg, hogyan hajtson végre OCR-t egy képen, és hogyan nyerjen + ki szöveget koordinátákkal a strukturált OCR felismerés segítségével. Lépésről lépésre + Python kód is mellékelve. 
+draft: false +keywords: +- run OCR on image +- extract text with coordinates +- structured OCR recognition +- OCR post‑processing +- bounding box extraction +- image text detection +language: hu +og_description: Futtass OCR-t a képen, és szerezd meg a szöveget koordinátákkal strukturált + OCR felismeréssel. Teljes Python példa magyarázatokkal. +og_title: Futtass OCR-t a képen – Strukturált szövegkivonás útmutató +tags: +- OCR +- Python +- Computer Vision +title: Képen OCR futtatása – Teljes útmutató a strukturált szövegkivonáshoz +url: /hu/python/general/run-ocr-on-image-complete-guide-to-structured-text-extractio/ +--- + +{{< blocks/products/pf/main-wrap-class >}} +{{< blocks/products/pf/main-container >}} +{{< blocks/products/pf/tutorial-page-section >}} + +# Futtass OCR-t képen – Teljes útmutató a strukturált szövegkivonáshoz + +Valaha is szükséged volt **OCR futtatására képen** fájlokon, de nem tudtad, hogyan tartsd meg a szavak pontos pozícióját? Nem vagy egyedül. Sok projektben – például nyugta beolvasás, űrlap digitalizálás vagy UI tesztelés – nem csak a nyers szövegre van szükség, hanem a körülhatároló dobozokra is, amelyek megmutatják, hol helyezkedik el az egyes sorok a képen. + +Ez a bemutató gyakorlati módot mutat be az *OCR futtatására képen* a **aocr** motor segítségével, a **strukturált OCR felismerés** kérésével, majd a geometria megőrzésével történő utófeldolgozással. A végére képes leszel **szöveg és koordináták kinyerésére** néhány Python sorral, és megérted, miért fontos a strukturált mód a további feladatokhoz. + +## Amit megtanulsz + +- Hogyan inicializáld az OCR motort **strukturált OCR felismeréshez**. +- Hogyan adj meg egy képet, és kapj nyers eredményeket, amelyek tartalmazzák a sorok határait. +- Hogyan futtass egy utófeldolgozót, amely megtisztítja a szöveget anélkül, hogy elveszítené a geometriát. +- Hogyan iterálj a végső sorokon, és írd ki a szöveget a hozzá tartozó körülhatároló dobozzal együtt. 
+ +Nincs varázslat, nincs rejtett lépés – csak egy teljes, futtatható példa, amelyet beilleszthetsz a saját projektedbe. + +--- + +## Előfeltételek + +Mielőtt belemerülnénk, győződj meg róla, hogy a következők telepítve vannak: + +```bash +pip install aocr ai # hypothetical packages; replace with real ones if needed +``` + +Szükséged lesz egy képfájlra (`input_image.png` vagy `.jpg`), amely tiszta, olvasható szöveget tartalmaz. Bármilyen, egy beolvasott számla vagy képernyőképről származó kép megfelel, amíg az OCR motor képes felismerni a karaktereket. + +--- + +## 1. lépés: Az OCR motor inicializálása strukturált felismeréshez + +Az első dolog, amit teszünk, hogy létrehozzuk a `aocr.Engine()` példányt, és jelezzük, hogy **strukturált OCR felismerést** szeretnénk. A strukturált mód nem csak a sima szöveget adja vissza, hanem geometriai adatokat (körülhatároló téglalapok) is minden sorhoz, ami elengedhetetlen, ha a szöveget vissza kell térképezni a képre. + +```python +import aocr +import ai # hypothetical post‑processing module + +# Initialise the OCR engine +ocr_engine = aocr.Engine() + +# Request structured recognition (text + geometry) +ocr_engine.recognize_mode = aocr.RecognitionMode.Structured +``` + +> **Miért fontos:** +> Alapértelmezett módban a motor csak egy összefűzött szövegsorozatot adhat vissza. A strukturált mód egy hierarchiát biztosít: oldalak → sorok → szavak, mindegyik koordinátákkal, ami sokkal egyszerűbbé teszi az eredmények átfedését az eredeti képen vagy egy layout‑érzékeny modellbe való betáplálását. + +--- + +## 2. lépés: OCR futtatása a képen és nyers eredmények lekérése + +Most betápláljuk a képet a motorba. A `recognize` hívás egy `OcrResult` objektumot ad vissza, amely sorok gyűjteményét tartalmazza, mindegyik saját körülhatároló téglalappal. 
+ +```python +# Load your image (any format supported by aocr) +input_image_path = "input_image.png" + +# Run OCR – this returns an OcrResult with lines and bounds +raw_result = ocr_engine.recognize(input_image_path) +``` + +Ekkor a `raw_result.lines` olyan objektumokat tartalmaz, amelyek két fontos attribútummal rendelkeznek: + +- `text` – a sorhoz tartozó felismert karakterlánc. +- `bounds` – egy `(x, y, width, height)` alakú tuple, amely leírja a sor pozícióját. + +--- + +## 3. lépés: Utófeldolgozás a geometria megőrzésével + +A nyers OCR kimenet gyakran zajos: eltévedt karakterek, helytelen szóközök vagy sortörés-problémák. Az `ai.run_postprocessor` függvény megtisztítja a szöveget, de **megtartja az eredeti geometriát**, így továbbra is pontos koordinátákkal rendelkezel. + +```python +# Apply a post‑processing step that corrects common OCR errors +postprocessed_result = ai.run_postprocessor(raw_result) + +# The structure (lines + bounds) stays the same, only `line.text` changes +``` + +> **Pro tipp:** Ha domain‑specifikus szókincsed van (pl. termékkódok), adj meg egy egyedi szótárat az utófeldolgozónak a pontosság javítása érdekében. + +--- + +## 4. lépés: Szöveg és koordináták kinyerése – iterálás és megjelenítés + +Végül végigjárjuk a megtisztított sorokat, és kiírjuk minden sor körülhatároló dobozát a szöveg mellett. Ez a **szöveg és koordináták kinyerésének** magja. + +```python +# Print each recognised line together with its bounding box +for line in postprocessed_result.lines: + print(f"[{line.bounds}] {line.text}") +``` + +### Várható kimenet + +Tegyük fel, hogy a bemeneti kép két sort tartalmaz: „Invoice #12345” és „Total: $89.99”. Valami ilyesmit fogsz látni: + +``` +[(15, 30, 210, 25)] Invoice #12345 +[(15, 70, 190, 25)] Total: $89.99 +``` + +Az első tuple a `(x, y, width, height)` érték, amely a sor helyét jelöli az eredeti képen, lehetővé téve téglalapok rajzolását, szöveg kiemelését vagy a koordináták más rendszerbe való betáplálását. 
+ +--- + +## Az eredmény megjelenítése (opcionális) + +Ha szeretnéd látni a körülhatároló dobozokat a képen, használhatod a Pillow‑t (PIL) a téglalapok rajzolásához. Az alábbi egy gyors snippet; nyugodtan kihagyhatod, ha csak a nyers adatokat akarod. + +```python +from PIL import Image, ImageDraw + +# Open the original image +img = Image.open(input_image_path) +draw = ImageDraw.Draw(img) + +# Draw a rectangle around each line +for line in postprocessed_result.lines: + x, y, w, h = line.bounds + draw.rectangle([x, y, x + w, y + h], outline="red", width=2) + +# Save or show the annotated image +img.save("annotated_output.png") +img.show() +``` + +![run OCR on image example showing bounding boxes](/images/ocr-bounding-boxes.png "run OCR on image – bounding box overlay") + +A fenti alternatív szöveg tartalmazza a **fő kulcsszót**, ezzel teljesítve a SEO követelményt a kép alt attribútumok számára. + +--- + +## Miért felülmúlja a strukturált OCR felismerés az egyszerű szövegkivonást + +Talán azt kérdezed, „Nem elég csak OCR‑t futtatni és a szöveget kapni? Miért van szükség a geometriára?” + +- **Térbeli kontextus:** Ha egy űrlapon mezőket kell párosítani (pl. „Dátum” a dátumérték mellett), a koordináták megmutatják, *hol* található az adat. +- **Többoszlopos elrendezések:** Az egyszerű lineáris szöveg elveszíti a sorrendet; a strukturált adat megőrzi az oszlopok sorrendjét. +- **Utófeldolgozási pontosság:** A doboz méretének ismerete segít eldönteni, hogy egy szó fejléc, lábjegyzet vagy csak egy véletlen maradvány-e. + +Röviden, a **strukturált OCR felismerés** rugalmasságot ad intelligens csővezetékek építéséhez – legyen szó adatbázisba való betáplálásról, kereshető PDF‑ek létrehozásáról vagy egy olyan gépi tanulási modell tréningjéről, amely tiszteletben tartja a layout‑ot. 
+ +--- + +## Gyakori szélhelyzetek és megoldások + +| Helyzet | Mire figyelj | Javasolt megoldás | +|-----------|-------------------|---------------| +| **Elforgatott vagy ferde képek** | A körülhatároló dobozok eltérhetnek a valóságtól. | Előfeldolgozás deskew‑el (pl. OpenCV `warpAffine`). | +| **Nagyon kicsi betűk** | A motor kihagyhat karaktereket, üres sorok keletkeznek. | Növeld a kép felbontását, vagy használd az `ocr_engine.set_dpi(300)` beállítást. | +| **Vegyes nyelvek** | Rossz nyelvi modell torz szöveget eredményezhet. | Állítsd be `ocr_engine.language = ["en", "de"]` a felismerés előtt. | +| **Átfedő dobozok** | Az utófeldolgozó esetleg két sort egybeolvas. | Ellenőrizd a `line.bounds` értékét a feldolgozás után; állítsd be a küszöböket az `ai.run_postprocessor`‑ben. | + +Ezeknek a szcenárióknak a korai kezelése rengeteg fejfájást takarít meg, különösen ha a megoldást naponta több száz dokumentumra skálázod. + +--- + +## Teljes, vég‑től‑végig script + +Az alábbi a kész, futtatható program, amely összekapcsolja az összes lépést. Másold be, módosítsd a kép útvonalát, és már indulhat is. + +```python +# -*- coding: utf-8 -*- +""" +Run OCR on image – extract text with coordinates using structured OCR recognition. 
+Author: Your Name +Date: 2026-05-03 +""" + +import aocr +import ai +from PIL import Image, ImageDraw + +def run_structured_ocr(image_path: str, annotate: bool = False): + # 1️⃣ Initialise the OCR engine + ocr_engine = aocr.Engine() + ocr_engine.recognize_mode = aocr.RecognitionMode.Structured + + # 2️⃣ Recognise the image + raw_result = ocr_engine.recognize(image_path) + + # 3️⃣ Post‑process while keeping geometry + processed = ai.run_postprocessor(raw_result) + + # 4️⃣ Print each line with its bounding box + for line in processed.lines: + print(f"[{line.bounds}] {line.text}") + + # Optional visualisation + if annotate: + img = Image.open(image_path) + draw = ImageDraw.Draw(img) + for line in processed.lines: + x, y, w, h = line.bounds + draw.rectangle([x, y, x + w, y + h], outline="red", width=2) + annotated_path = "annotated_" + image_path + img.save(annotated_path) + print(f"Annotated image saved as {annotated_path}") + +if __name__ == "__main__": + INPUT_IMG = "input_image.png" + run_structured_ocr(INPUT_IMG, annotate=True) +``` + +A script futtatása: + +1. **OCR futtatása képen** strukturált módban. +2. **Szöveg és koordináták kinyerése** minden sorhoz. +3. Opcionálisan egy annotált PNG előállítása a dobozokkal. + +--- + +## Összegzés + +Most már egy szilárd, önálló megoldásod van a **OCR futtatására képen** és a **szöveg és koordináták kinyerésére** a **strukturált OCR felismerés** segítségével. A kód minden lépést bemutat – a motor inicializálásától az utófeldolgozáson át a vizuális ellenőrzésig –, így könnyen adaptálható nyugták, űrlapok vagy bármely vizuális dokumentum esetén, amely pontos szöveg‑lokalizációt igényel. + +Mi a következő? Próbáld ki a `aocr` motort egy másik könyvtárral (Tesseract, EasyOCR), és nézd meg, hogyan különböznek a strukturált kimeneteik. Kísérletezz különböző utófeldolgozási stratégiákkal, például helyesírás‑ellenőrzéssel vagy egyedi regex‑szűrőkkel, hogy növeld a pontosságot a saját területedhez. 
Ha nagyobb csővezetéket építesz, fontold meg a `(text, bounds)` párok adatbázisba való tárolását későbbi elemzésekhez. + +Boldog kódolást, és legyenek a OCR projektjeid mindig pontosak! + +{{< /blocks/products/pf/tutorial-page-section >}} +{{< /blocks/products/pf/main-container >}} +{{< /blocks/products/pf/main-wrap-class >}} +{{< blocks/products/products-backtop-button >}} \ No newline at end of file diff --git a/ocr/indonesian/python/general/extract-text-from-image-ocr-with-aspose-ai-spell-check/_index.md b/ocr/indonesian/python/general/extract-text-from-image-ocr-with-aspose-ai-spell-check/_index.md new file mode 100644 index 000000000..1c6bf8eb1 --- /dev/null +++ b/ocr/indonesian/python/general/extract-text-from-image-ocr-with-aspose-ai-spell-check/_index.md @@ -0,0 +1,230 @@ +--- +category: general +date: 2026-05-03 +description: Ekstrak teks dari gambar menggunakan Aspose OCR dan pemeriksaan ejaan + AI. Pelajari cara melakukan OCR pada gambar, memuat gambar untuk OCR, mengenali + teks dari faktur, dan melepaskan sumber daya GPU. +draft: false +keywords: +- extract text from image +- how to ocr image +- load image for ocr +- release gpu resources +- recognize text from invoice +language: id +og_description: Ekstrak teks dari gambar dengan Aspose OCR dan pemeriksaan ejaan AI. + Panduan langkah demi langkah yang mencakup cara melakukan OCR pada gambar, memuat + gambar untuk OCR, dan melepaskan sumber daya GPU. 
+og_title: Ekstrak teks dari gambar – Panduan Lengkap OCR & Pemeriksaan Ejaan +tags: +- OCR +- Aspose +- AI +- Python +title: Ekstrak teks dari gambar – OCR dengan Aspose AI Spell‑Check +url: /id/python/general/extract-text-from-image-ocr-with-aspose-ai-spell-check/ +--- + +{{< blocks/products/pf/main-wrap-class >}} +{{< blocks/products/pf/main-container >}} +{{< blocks/products/pf/tutorial-page-section >}} + +# ekstrak teks dari gambar – Panduan Lengkap OCR & Pemeriksaan Ejaan + +Pernah perlu **ekstrak teks dari gambar** tetapi tidak yakin perpustakaan mana yang memberikan kecepatan dan akurasi? Anda tidak sendirian. Dalam banyak proyek dunia nyata—seperti pemrosesan faktur, digitalisasi struk, atau pemindaian kontrak—mendapatkan teks bersih yang dapat dicari dari sebuah gambar adalah tantangan pertama. + +Kabar baiknya, Aspose OCR yang dipasangkan dengan model Aspose AI yang ringan dapat menangani pekerjaan itu dalam beberapa baris Python. Dalam tutorial ini kami akan menjelaskan **cara OCR gambar**, memuat gambar dengan benar, menjalankan post‑processor pemeriksaan ejaan bawaan, dan akhirnya **melepaskan sumber daya GPU** agar aplikasi Anda tetap ramah memori. + +Pada akhir panduan ini Anda akan dapat **mengenali teks dari gambar faktur**, memperbaiki kesalahan OCR umum secara otomatis, dan menjaga GPU Anda bersih untuk batch berikutnya. + +--- + +## Apa yang Anda Butuhkan + +- Python 3.9 atau lebih baru (kode menggunakan type hints tetapi bekerja pada versi 3.x sebelumnya) +- Paket `aspose-ocr` dan `aspose-ai` (pasang via `pip install aspose-ocr aspose-ai`) +- GPU yang mendukung CUDA bersifat opsional; skrip akan beralih ke CPU jika tidak ada. +- Contoh gambar, misalnya `sample_invoice.png`, ditempatkan di folder yang dapat Anda referensikan. + +Tidak ada kerangka kerja ML berat, tidak ada unduhan model besar—hanya model kuantisasi Q4‑K‑M kecil yang muat dengan nyaman pada kebanyakan GPU. 
+ +--- + +## Langkah 1: Inisialisasi Mesin OCR – ekstrak teks dari gambar + +Hal pertama yang Anda lakukan adalah membuat instance `OcrEngine` dan memberi tahu bahasa yang diharapkan. Di sini kami memilih Bahasa Inggris dan meminta output plain‑text, yang ideal untuk pemrosesan lanjutan. + +```python +import aocr # Aspose OCR package +import aspose.ai as ai # Aspose AI package + +# Initialise the OCR engine +ocr_engine = aocr.OcrEngine() +ocr_engine.language = aocr.Language.English # Choose any supported language +ocr_engine.recognize_mode = aocr.RecognitionMode.Plain # Plain text makes post‑processing easier +``` + +**Mengapa ini penting:** Menetapkan bahasa mempersempit set karakter, meningkatkan akurasi. Mode plain‑text menghapus informasi tata letak yang biasanya tidak Anda perlukan ketika hanya ingin mengekstrak teks dari gambar. + +--- + +## Langkah 2: Muat gambar untuk OCR – cara OCR gambar + +Sekarang kami memberi mesin gambar yang sebenarnya. Helper `Image.load` memahami format umum (PNG, JPEG, TIFF) dan mengabstraksi keanehan file‑IO. + +```python +# Load the input image – this is the "load image for OCR" step +input_image = aocr.Image.load("YOUR_DIRECTORY/sample_invoice.png") +raw_text = ocr_engine.recognize(input_image) # Returns the recognised text as a string +``` + +**Tip:** Jika gambar sumber Anda berukuran besar, pertimbangkan untuk mengubah ukurannya sebelum mengirim ke mesin; dimensi yang lebih kecil dapat mengurangi penggunaan memori GPU tanpa mengurangi kualitas pengenalan. + +--- + +## Langkah 3: Konfigurasikan Model Aspose AI – mengenali teks dari faktur + +Aspose AI dilengkapi dengan model GGUF kecil yang dapat diunduh otomatis. Contoh ini menggunakan repositori `Qwen2.5‑3B‑Instruct‑GGUF`, terkuantisasi menjadi `q4_k_m`. Kami juga memberi tahu runtime untuk mengalokasikan 20 lapisan pada GPU, yang menyeimbangkan kecepatan dan penggunaan VRAM. 
+ +```python +# Model configuration – auto‑download a small Q4‑K‑M quantised model +model_config = ai.AsposeAIModelConfig() +model_config.allow_auto_download = "true" +model_config.hugging_face_repo_id = "bartowski/Qwen2.5-3B-Instruct-GGUF" +model_config.hugging_face_quantization = "q4_k_m" +model_config.gpu_layers = 20 # Use 20 GPU layers when a GPU is available +``` + +**Di balik layar:** Model terkuantisasi berukuran sekitar 1,5 GB di disk, hanya sebagian kecil dari model presisi penuh, namun tetap menangkap nuansa linguistik yang cukup untuk menandai kesalahan ejaan OCR yang umum. + +--- + +## Langkah 4: Inisialisasi AsposeAI dan lampirkan post‑processor pemeriksaan ejaan + +Aspose AI menyertakan post‑processor pemeriksaan ejaan siap pakai. Dengan melampirkannya, setiap hasil OCR akan dibersihkan secara otomatis. + +```python +# Initialise AsposeAI and attach the built‑in spell‑check post‑processor +ocr_ai = ai.AsposeAI(model_config) # Pass the config we just built +ocr_ai.set_post_processor(ocr_ai.postprocessor_spell_check, {}) # Empty dict → default settings +``` + +**Mengapa menggunakan post‑processor?** Mesin OCR sering salah membaca “Invoice” menjadi “Invo1ce” atau “Total” menjadi “T0tal”. Pemeriksaan ejaan menjalankan model bahasa ringan pada string mentah dan memperbaiki kesalahan tersebut tanpa Anda menulis kamus khusus. + +--- + +## Langkah 5: Jalankan post‑processor pemeriksaan ejaan pada hasil OCR + +Dengan semua terhubung, satu panggilan menghasilkan teks yang telah dikoreksi. Kami juga mencetak versi asli dan versi bersih sehingga Anda dapat melihat perbaikannya. + +```python +# Run the spell‑check post‑processor on the OCR result +corrected_text = ocr_ai.run_postprocessor(raw_text) + +print("Original :", raw_text) +print("Corrected:", corrected_text) +``` + +Output tipikal untuk faktur mungkin terlihat seperti ini: + +``` +Original : Invo1ce #12345 +Date: 2023/07/15 +Total: $1,250.00 +... 
+Corrected: Invoice #12345 +Date: 2023/07/15 +Total: $1,250.00 +... +``` + +Perhatikan bagaimana “Invo1ce” berubah menjadi kata yang tepat “Invoice”. Itulah kekuatan pemeriksaan ejaan AI bawaan. + +--- + +## Langkah 6: Lepaskan sumber daya GPU – lepaskan sumber daya GPU dengan aman + +Jika Anda menjalankan ini dalam layanan yang berjalan lama (mis., API web yang memproses puluhan faktur per menit), Anda harus membebaskan konteks GPU setelah setiap batch. Jika tidak, Anda akan melihat kebocoran memori dan akhirnya mendapatkan error “CUDA out of memory”. + +```python +# Release GPU resources – crucial to avoid memory leaks +ocr_ai.free_resources() +``` + +**Tips pro:** Panggil `free_resources()` di dalam blok `finally` atau context manager sehingga selalu dijalankan, bahkan jika terjadi pengecualian. + +--- + +## Contoh Kerja Lengkap + +Menggabungkan semua bagian memberikan Anda skrip mandiri yang dapat Anda masukkan ke proyek apa pun. + +```python +# extract_text_from_image.py +import aocr +import aspose.ai as ai + +def main(): + # Step 1: Initialise OCR engine + ocr_engine = aocr.OcrEngine() + ocr_engine.language = aocr.Language.English + ocr_engine.recognize_mode = aocr.RecognitionMode.Plain + + # Step 2: Load image for OCR + input_image = aocr.Image.load("YOUR_DIRECTORY/sample_invoice.png") + raw_text = ocr_engine.recognize(input_image) + + # Step 3: Configure Aspose AI model + model_cfg = ai.AsposeAIModelConfig() + model_cfg.allow_auto_download = "true" + model_cfg.hugging_face_repo_id = "bartowski/Qwen2.5-3B-Instruct-GGUF" + model_cfg.hugging_face_quantization = "q4_k_m" + model_cfg.gpu_layers = 20 + + # Step 4: Initialise AI and attach spell‑check + ocr_ai = ai.AsposeAI(model_cfg) + ocr_ai.set_post_processor(ocr_ai.postprocessor_spell_check, {}) + + # Step 5: Run spell‑check + corrected_text = ocr_ai.run_postprocessor(raw_text) + + print("Original :", raw_text) + print("Corrected:", corrected_text) + + # Step 6: Release GPU resources + ocr_ai.free_resources() 
+ +if __name__ == "__main__": + main() +``` + +Simpan file, sesuaikan path ke gambar Anda, dan jalankan `python extract_text_from_image.py`. Anda akan melihat teks faktur yang telah dibersihkan tercetak di konsol. + +--- + +## Pertanyaan yang Sering Diajukan (FAQ) + +**Q: Apakah ini bekerja pada mesin hanya CPU?** +A: Tentu saja. Jika tidak ada GPU yang terdeteksi, Aspose AI beralih ke eksekusi CPU, meskipun akan lebih lambat. Anda dapat memaksa CPU dengan mengatur `model_cfg.gpu_layers = 0`. + +**Q: Bagaimana jika faktur saya dalam bahasa selain Bahasa Inggris?** +A: Ubah `ocr_engine.language` ke nilai enum yang sesuai (mis., `aocr.Language.Spanish`). Model pemeriksaan ejaan bersifat multibahasa, tetapi Anda mungkin mendapatkan hasil yang lebih baik dengan model khusus bahasa. + +**Q: Bisakah saya memproses beberapa gambar dalam loop?** +A: Ya. Pindahkan langkah pemuatan, pengenalan, dan post‑processing ke dalam loop `for`. Ingat untuk memanggil `ocr_ai.free_resources()` setelah loop atau setelah setiap batch jika Anda menggunakan kembali instance AI yang sama. + +**Q: Seberapa besar ukuran unduhan model?** +A: Sekitar 1,5 GB untuk versi terkuantisasi `q4_k_m`. Model ini di‑cache setelah run pertama, sehingga eksekusi selanjutnya menjadi instan. + +--- + +## Kesimpulan + +Dalam tutorial ini kami menunjukkan cara **mengekstrak teks dari gambar** menggunakan Aspose OCR, mengonfigurasi model AI kecil, menerapkan post‑processor pemeriksaan ejaan, dan dengan aman **melepaskan sumber daya GPU**. Alur kerja mencakup semua hal mulai dari memuat gambar hingga membersihkan setelah selesai, memberi Anda pipeline yang handal untuk skenario **mengenali teks dari faktur**. + +Langkah selanjutnya? 
Coba ganti pemeriksaan ejaan dengan model ekstraksi entitas khusus + +{{< /blocks/products/pf/tutorial-page-section >}} +{{< /blocks/products/pf/main-container >}} +{{< /blocks/products/pf/main-wrap-class >}} +{{< blocks/products/products-backtop-button >}} \ No newline at end of file diff --git a/ocr/indonesian/python/general/how-to-batch-ocr-with-aspose-ocr-full-python-guide/_index.md b/ocr/indonesian/python/general/how-to-batch-ocr-with-aspose-ocr-full-python-guide/_index.md new file mode 100644 index 000000000..ed64e5356 --- /dev/null +++ b/ocr/indonesian/python/general/how-to-batch-ocr-with-aspose-ocr-full-python-guide/_index.md @@ -0,0 +1,216 @@ +--- +category: general +date: 2026-05-03 +description: Cara melakukan OCR batch pada gambar menggunakan Aspose OCR dan pemeriksaan + ejaan AI. Pelajari cara mengekstrak teks dari gambar, menerapkan pemeriksaan ejaan, + memanfaatkan sumber daya AI gratis, dan memperbaiki kesalahan OCR. +draft: false +keywords: +- how to batch ocr +- extract text from images +- free ai resources +- apply spell check +- correct ocr errors +language: id +og_description: Cara memproses batch OCR gambar menggunakan Aspose OCR dan pemeriksaan + ejaan AI. Ikuti panduan langkah demi langkah untuk mengekstrak teks dari gambar, + menerapkan pemeriksaan ejaan, memanfaatkan sumber daya AI gratis, dan memperbaiki + kesalahan OCR. +og_title: Cara Melakukan OCR Batch dengan Aspose OCR – Tutorial Python Lengkap +tags: +- OCR +- Python +- AI +- Aspose +title: Cara Batch OCR dengan Aspose OCR – Panduan Python Lengkap +url: /id/python/general/how-to-batch-ocr-with-aspose-ocr-full-python-guide/ +--- + +{{< blocks/products/pf/main-wrap-class >}} +{{< blocks/products/pf/main-container >}} +{{< blocks/products/pf/tutorial-page-section >}} + +# Cara Batch OCR dengan Aspose OCR – Panduan Python Lengkap + +Pernah bertanya-tanya **cara batch OCR** seluruh folder PDF yang dipindai atau foto tanpa menulis skrip terpisah untuk setiap file? Anda tidak sendirian. 
Dalam banyak alur kerja dunia nyata Anda perlu **mengekstrak teks dari gambar**, membersihkan kesalahan ejaan, dan akhirnya membebaskan sumber daya AI yang telah Anda alokasikan. Tutorial ini menunjukkan secara tepat cara melakukannya dengan Aspose OCR, post‑processor AI ringan, dan beberapa baris Python. + +Kami akan menjelaskan cara menginisialisasi mesin OCR, menghubungkan AI spell‑checker, melakukan loop pada direktori gambar, dan membersihkan model setelahnya. Pada akhir tutorial Anda akan memiliki skrip siap‑jalankan yang **corrects OCR errors** secara otomatis dan melepaskan **free AI resources** sehingga GPU Anda tetap bahagia. + +## Apa yang Anda Butuhkan + +- Python 3.9+ (kode menggunakan type‑hints tetapi bekerja pada versi 3.x sebelumnya) +- `asposeocr` package (`pip install asposeocr`) – paket ini menyediakan mesin OCR. +- Akses ke model Hugging Face `bartowski/Qwen2.5-3B-Instruct-GGUF` (diunduh secara otomatis). +- GPU dengan setidaknya beberapa GB VRAM (skrip mengatur `gpu_layers = 30`, Anda dapat menurunkannya jika diperlukan). + +Tidak ada layanan eksternal, tidak ada API berbayar – semuanya berjalan secara lokal. + +--- + +## Langkah 1: Siapkan Mesin OCR – **How to Batch OCR** Secara Efisien + +Sebelum kita dapat memproses seribu gambar, kita membutuhkan mesin OCR yang solid. Aspose OCR memungkinkan kita memilih bahasa dan mode pengenalan dalam satu panggilan. + +```python +# Step 1: Initialize the OCR engine for English plain‑text output +def init_ocr() -> aocr.OcrEngine: + ocr_engine = aocr.OcrEngine() + ocr_engine.language = aocr.Language.English # English language pack + ocr_engine.recognize_mode = aocr.RecognitionMode.Plain # Returns raw string, no layout + return ocr_engine +``` + +**Why this matters:** Mengatur `recognize_mode` ke `Plain` menjaga output tetap ringan, yang ideal ketika Anda berencana menjalankan spell‑check nanti. 
Jika Anda membutuhkan informasi tata letak, Anda dapat beralih ke `Layout`, tetapi itu menambah beban yang mungkin tidak Anda inginkan dalam pekerjaan batch. + +> **Pro tip:** Jika Anda menangani pemindaian multibahasa, Anda dapat memberikan daftar seperti `ocr_engine.language = [aocr.Language.English, aocr.Language.Spanish]`. + +--- + +## Langkah 2: Inisialisasi AI Post‑Processor – **Apply Spell Check** pada Output OCR + +Aspose AI dilengkapi dengan post‑processor bawaan yang dapat menjalankan model apa pun yang Anda inginkan. Di sini kami mengambil model Qwen 2.5 yang terkuantisasi dari Hugging Face dan menghubungkan rutin spell‑check. + +```python +# Step 2: Configure and start the AI post‑processor +def init_ai() -> aocr.ai.AsposeAI: + model_cfg = AsposeAIModelConfig() + model_cfg.allow_auto_download = True + model_cfg.hugging_face_repo_id = "bartowski/Qwen2.5-3B-Instruct-GGUF" + model_cfg.hugging_face_quantization = "q4_k_m" + model_cfg.gpu_layers = 30 # Adjust based on your GPU memory + ai_processor = AsposeAI() + ai_processor.initialize(model_cfg) + + # Attach the built‑in spell‑check post‑processor + ai_processor.set_post_processor(ai_processor.postprocessor_spell_check, {}) + return ai_processor +``` + +**Why this matters:** Model ini terkuantisasi (`q4_k_m`), yang mengurangi penggunaan memori secara signifikan sambil tetap memberikan pemahaman bahasa yang layak. Dengan memanggil `set_post_processor` kami memberi tahu Aspose AI untuk menjalankan langkah **apply spell check** secara otomatis pada setiap string yang kami berikan. + +> **Watch out:** Jika GPU Anda tidak dapat menangani 30 lapisan, turunkan jumlahnya menjadi 15 atau bahkan 5 – skrip tetap akan berfungsi, hanya sedikit lebih lambat. + +--- + +## Langkah 3: Jalankan OCR dan **Correct OCR Errors** pada Gambar Tunggal + +Sekarang karena mesin OCR dan AI spell‑checker sudah siap, kami menggabungkannya.
Fungsi ini memuat gambar, mengekstrak teks mentah, kemudian menjalankan AI post‑processor untuk membersihkannya. + +```python +# Step 3: OCR an image and run the spell‑check post‑processor +def ocr_and_correct(image_path: str, + ocr_engine: aocr.OcrEngine, + ai_processor: aocr.ai.AsposeAI) -> str: + image = aocr.Image.load(image_path) # Load any supported format + raw_text = ocr_engine.recognize(image) # Plain string from OCR + corrected_text = ai_processor.run_postprocessor(raw_text) + return corrected_text +``` + +**Why this matters:** Memasukkan string OCR mentah langsung ke dalam model AI memberi kami proses **correct OCR errors** tanpa menulis regex atau kamus khusus. Model mengetahui konteks, sehingga dapat memperbaiki “recieve” → “receive” dan kesalahan yang lebih halus. + +--- + +## Langkah 4: **Extract Text from Images** secara Massal – Loop Batch Sebenarnya + +Di sinilah keajaiban **how to batch OCR** bersinar. Kami mengiterasi direktori, melewatkan file yang tidak didukung, dan menulis setiap output yang telah dikoreksi ke file `.txt`. 
+ +```python +# Step 4: Process an entire folder of images +if __name__ == "__main__": + # Initialize once – reuse for every file + ocr_engine = init_ocr() + ai_processor = init_ai() + + input_dir = "YOUR_DIRECTORY/input_images" + output_dir = "YOUR_DIRECTORY/output_text" + os.makedirs(output_dir, exist_ok=True) + + for file_name in os.listdir(input_dir): + # Only handle common image extensions + if not file_name.lower().endswith(('.png', '.jpg', '.jpeg', '.tif', '.tiff')): + continue + + image_path = os.path.join(input_dir, file_name) + corrected = ocr_and_correct(image_path, ocr_engine, ai_processor) + + txt_path = os.path.join(output_dir, + os.path.splitext(file_name)[0] + ".txt") + with open(txt_path, "w", encoding="utf-8") as txt_file: + txt_file.write(corrected) + + print(f"Processed {file_name}") + + # Step 5: Release **free AI resources** after the batch finishes + ai_processor.free_resources() +``` + +### Output yang Diharapkan + +Untuk gambar yang berisi kalimat *“The quick brown fox jumps over the lazzy dog.”* Anda akan melihat file teks dengan: + +``` +The quick brown fox jumps over the lazy dog. +``` + +Perhatikan bahwa double “z” telah dikoreksi secara otomatis – itulah AI spell‑check yang beraksi. + +**Why this matters:** Dengan membuat objek OCR dan AI **sekali** dan menggunakannya kembali, kita menghindari beban memuat model untuk setiap file. Ini adalah cara paling efisien untuk **how to batch OCR** dalam skala besar. + +--- + +## Langkah 5: Bersihkan – **Free AI Resources** dengan Benar + +Saat Anda selesai, memanggil `free_resources()` melepaskan memori GPU, konteks CUDA, dan file sementara apa pun yang dibuat model. + +```python +# Step 5: Explicitly free GPU and model memory +ai_processor.free_resources() +``` + +Melewatkan langkah ini dapat meninggalkan alokasi GPU yang menggantung, yang mungkin menyebabkan proses Python berikutnya crash atau menghabiskan VRAM. Anggap saja ini sebagai bagian “mematikan lampu” dari pekerjaan batch. 
+ +--- + +## Kesulitan Umum & Tips Tambahan + +| Issue | What to Look For | Fix | +|-------|------------------|-----| +| **Out‑of‑memory errors** | GPU kehabisan memori setelah beberapa lusin gambar | Kurangi `gpu_layers` atau beralih ke CPU (`model_cfg.gpu_layers = 0`). | +| **Missing language pack** | OCR mengembalikan string kosong | Pastikan versi `asposeocr` menyertakan data bahasa Inggris; instal ulang jika diperlukan. | +| **Non‑image files** | Skrip crash pada file `.pdf` yang tidak diinginkan | Guard `if not file_name.lower().endswith(...)` sudah melewatkannya. | +| **Spell‑check not applied** | Output terlihat identik dengan OCR mentah | Pastikan `ai_processor.set_post_processor` dipanggil sebelum loop. | +| **Slow batch speed** | Membutuhkan >5 detik per gambar | Setel `model_cfg.allow_auto_download = False` setelah run pertama, sehingga model tidak diunduh ulang setiap kali. | + +**Pro tip:** Jika Anda perlu **extract text from images** dalam bahasa selain Inggris, cukup ubah `ocr_engine.language` ke enum yang sesuai (mis., `aocr.Language.French`). AI post‑processor yang sama tetap akan menerapkan spell‑check, tetapi Anda mungkin menginginkan model khusus bahasa untuk hasil terbaik. + +--- + +## Ringkasan & Langkah Selanjutnya + +Kami telah membahas seluruh pipeline untuk **how to batch OCR**: + +1. **Initialize** mesin OCR plain‑text untuk bahasa Inggris. +2. **Configure** model AI spell‑check dan mengikatnya sebagai post‑processor. +3. **Run** OCR pada setiap gambar dan biarkan AI **correct OCR errors** secara otomatis. +4. **Loop** pada direktori untuk **extract text from images** secara massal. +5. **Free AI resources** setelah pekerjaan selesai. + +Dari sini Anda dapat: + +- Menyalurkan teks yang telah dikoreksi ke pipeline NLP downstream (analisis sentimen, ekstraksi entitas, dll.). +- Mengganti post‑processor spell‑check dengan summarizer khusus dengan memanggil `ai_processor.set_post_processor(your_custom_func, {})`.
+- Memparallelkan loop folder dengan `concurrent.futures.ThreadPoolExecutor` jika GPU Anda dapat menangani multiple streams. + +--- + +## Pemikiran Akhir + +Batching OCR tidak harus menjadi pekerjaan yang melelahkan. Dengan memanfaatkan Aspose OCR bersama model AI ringan, Anda mendapatkan **one‑stop solution** yang **extracts text from images**, **applies spell check**, **corrects OCR errors**, dan **frees AI resources** secara bersih. Jalankan skrip pada folder percobaan, sesuaikan jumlah lapisan GPU agar cocok dengan perangkat keras Anda, dan Anda akan memiliki pipeline siap produksi dalam hitungan menit. + +Ada pertanyaan tentang menyesuaikan model, menangani PDF, atau mengintegrasikan ini ke layanan web? Tinggalkan komentar di bawah atau hubungi saya di GitHub. Selamat coding, semoga OCR Anda selalu akurat! + +{{< /blocks/products/pf/tutorial-page-section >}} +{{< /blocks/products/pf/main-container >}} +{{< /blocks/products/pf/main-wrap-class >}} +{{< blocks/products/products-backtop-button >}} \ No newline at end of file diff --git a/ocr/indonesian/python/general/python-ocr-tutorial-batch-ocr-processing-made-easy/_index.md b/ocr/indonesian/python/general/python-ocr-tutorial-batch-ocr-processing-made-easy/_index.md new file mode 100644 index 000000000..ff270ee63 --- /dev/null +++ b/ocr/indonesian/python/general/python-ocr-tutorial-batch-ocr-processing-made-easy/_index.md @@ -0,0 +1,299 @@ +--- +category: general +date: 2026-05-03 +description: Tutorial OCR Python yang menunjukkan cara memuat file gambar PNG, mengenali + teks dari gambar, dan sumber daya AI gratis untuk pemrosesan OCR batch. +draft: false +keywords: +- python ocr tutorial +- batch ocr processing +- free ai resources +- load png image +- recognize text from image +language: id +og_description: Tutorial OCR Python memandu Anda melalui proses memuat gambar PNG, + mengenali teks dari gambar, dan menangani sumber daya AI gratis untuk pemrosesan + OCR batch. 
+og_title: Tutorial OCR Python – OCR Batch Cepat dengan Sumber Daya AI Gratis +tags: +- OCR +- Python +- AI +title: Tutorial OCR Python – Pemrosesan OCR Batch menjadi Mudah +url: /id/python/general/python-ocr-tutorial-batch-ocr-processing-made-easy/ +--- + +{{< blocks/products/pf/main-wrap-class >}} +{{< blocks/products/pf/main-container >}} +{{< blocks/products/pf/tutorial-page-section >}} + +# Tutorial OCR Python – Memproses OCR Batch dengan Mudah + +Pernah membutuhkan **python ocr tutorial** yang benar‑benar memungkinkan Anda menjalankan OCR pada puluhan file PNG tanpa membuat Anda stres? Anda tidak sendirian. Dalam banyak proyek dunia nyata Anda harus **load png image** file, memberi mereka ke mesin, dan kemudian membersihkan sumber daya AI setelah selesai. + +Dalam panduan ini kami akan membimbing Anda melalui contoh lengkap yang siap dijalankan yang menunjukkan secara tepat cara **recognize text from image** file, memprosesnya secara batch, dan membebaskan memori AI yang mendasarinya. Pada akhir tutorial Anda akan memiliki skrip mandiri yang dapat Anda masukkan ke proyek mana pun—tanpa tambahan yang tidak perlu, hanya esensialnya. + +## Apa yang Anda Butuhkan + +- Python 3.10 atau lebih baru (sintaks yang digunakan di sini mengandalkan f‑strings dan type hints) +- Sebuah perpustakaan OCR yang menyediakan metode `engine.recognize` – untuk demo kami mengasumsikan paket fiktif `aocr`, tetapi Anda dapat menggantinya dengan Tesseract, EasyOCR, dll. +- Modul pembantu `ai` yang ditunjukkan dalam cuplikan kode (menangani inisialisasi model dan pembersihan sumber daya) +- Sebuah folder berisi file PNG yang ingin Anda proses + +Jika Anda belum memiliki `aocr` atau `ai` terpasang, Anda dapat menirunya dengan stub – lihat bagian “Optional Stubs” di akhir. + +## Langkah 1: Inisialisasi AI Engine (Bebaskan Sumber Daya AI) + +Sebelum Anda memberi gambar apa pun ke pipeline OCR, model yang mendasarinya harus siap. 
Menginisialisasi hanya sekali menghemat memori dan mempercepat pekerjaan batch. + +```python +# step_1_initialize.py +import ai # hypothetical helper that wraps the AI model +import aocr # OCR library + +def init_engine(config_path: str = "config.yaml"): + """ + Initialize the AI engine if it hasn't been set up yet. + This uses free AI resources – the engine will be released later. + """ + if not ai.is_initialized(): + ai.initialize(config_path) # auto‑initialize with the provided configuration + else: + print("Engine already initialized.") +``` + +**Mengapa ini penting:** +Memanggil `ai.initialize` berulang‑ulang untuk setiap gambar akan mengalokasikan memori GPU berulang kali, yang pada akhirnya dapat membuat skrip crash. Dengan memeriksa `ai.is_initialized()` kita menjamin alokasi tunggal – itulah prinsip “bebaskan sumber daya AI”. + +## Langkah 2: Muat File Gambar PNG untuk Pemrosesan OCR Batch + +Sekarang kita mengumpulkan semua file PNG yang ingin diproses melalui OCR. Menggunakan `pathlib` membuat kode tetap lintas‑OS. + +```python +# step_2_load_images.py +from pathlib import Path +from typing import List + +def collect_png_paths(directory: str) -> List[Path]: + """ + Scan `directory` and return a list of Path objects pointing to PNG files. + """ + base_path = Path(directory) + if not base_path.is_dir(): + raise NotADirectoryError(f"'{directory}' is not a valid folder.") + + png_files = sorted(base_path.glob("*.png")) + if not png_files: + raise FileNotFoundError("No PNG images found in the specified directory.") + + print(f"Found {len(png_files)} PNG image(s) to process.") + return png_files +``` + +**Kasus tepi:** +Jika folder berisi file non‑PNG (misalnya JPEG) mereka akan diabaikan, mencegah `engine.recognize` mengalami kegagalan karena format yang tidak didukung. 
+ +## Langkah 3: Jalankan OCR pada Setiap Gambar dan Terapkan Post‑Processing + +Dengan engine siap dan daftar file sudah dipersiapkan, kita dapat melintasi gambar‑gambar, mengekstrak teks mentah, dan menyerahkannya ke post‑processor yang membersihkan artefak OCR umum (seperti pemisahan baris yang tidak diinginkan). + +```python +# step_3_ocr_batch.py +import aocr +import ai +from pathlib import Path +from typing import List + +def ocr_batch(image_paths: List[Path]) -> List[str]: + """ + Perform OCR on each PNG image and return a list of cleaned strings. + """ + results = [] + for image_path in image_paths: + # Load the image – aocr.Image.load abstracts away Pillow/OpenCV details + img = aocr.Image.load(str(image_path)) + + # Recognize raw text (the engine instance lives in the aocr module) + raw_text = aocr.engine.recognize(img) + + # Refine the raw OCR output using the AI post‑processor + cleaned_text = ai.run_postprocessor(raw_text) + results.append(cleaned_text) + + print(f"Processed {image_path.name}: {len(cleaned_text)} characters extracted.") + + return results +``` + +**Mengapa kami memisahkan pemuatan dari pengenalan:** +`aocr.Image.load` mungkin melakukan decoding secara lazy, yang lebih cepat untuk batch besar. Menjaga langkah pemuatan secara eksplisit juga memudahkan penggantian dengan perpustakaan gambar lain jika Anda kemudian perlu menangani file JPEG atau TIFF. + +## Langkah 4: Bersihkan – Bebaskan Sumber Daya AI Setelah Batch Selesai + +Setelah batch selesai, kita harus melepaskan model untuk menghindari kebocoran memori, terutama pada mesin yang memiliki GPU. + +```python +# step_4_cleanup.py +import ai + +def release_resources(): + """ + Free any allocated AI resources. Safe to call multiple times. + """ + if ai.is_initialized(): + ai.free_resources() + print("AI resources have been released.") + else: + print("No AI resources were allocated.") +``` + +## Menyatukan Semua – Skrip Lengkap + +Berikut adalah satu file yang menyatukan empat langkah menjadi alur kerja yang koheren.
Simpan sebagai `batch_ocr.py` dan jalankan dari command line. + +```python +# batch_ocr.py +""" +Python OCR tutorial – end‑to‑end batch OCR processing. +Loads PNG images, runs OCR, post‑processes results, and frees AI resources. +""" + +import sys +from pathlib import Path +import ai +import aocr + +# ---------------------------------------------------------------------- +# Helper functions (copied from the steps above) +# ---------------------------------------------------------------------- +def init_engine(cfg: str = "config.yaml"): + if not ai.is_initialized(): + ai.initialize(cfg) + else: + print("Engine already initialized.") + +def collect_png_paths(directory: str): + base_path = Path(directory) + if not base_path.is_dir(): + raise NotADirectoryError(f"'{directory}' is not a valid folder.") + png_files = sorted(base_path.glob("*.png")) + if not png_files: + raise FileNotFoundError("No PNG images found in the specified directory.") + print(f"Found {len(png_files)} PNG image(s) to process.") + return png_files + +def ocr_batch(image_paths): + results = [] + for image_path in image_paths: + img = aocr.Image.load(str(image_path)) + raw_text = aocr.engine.recognize(img) + cleaned_text = ai.run_postprocessor(raw_text) + results.append(cleaned_text) + print(f"Processed {image_path.name}: {len(cleaned_text)} characters.") + return results + +def release_resources(): + if ai.is_initialized(): + ai.free_resources() + print("AI resources released.") + else: + print("No resources to release.") + +# ---------------------------------------------------------------------- +# Main execution block +# ---------------------------------------------------------------------- +if __name__ == "__main__": + if len(sys.argv) != 2: + print("Usage: python batch_ocr.py <image_directory>") + sys.exit(1) + + image_dir = sys.argv[1] + + try: + init_engine() + png_paths = collect_png_paths(image_dir) + texts = ocr_batch(png_paths) + + # Optional: write results to a single text file + output_file = 
Path("ocr_results.txt") + with output_file.open("w", encoding="utf-8") as f: + for path, txt in zip(png_paths, texts): + f.write(f"--- {path.name} ---\n") + f.write(txt + "\n\n") + print(f"All results saved to {output_file.resolve()}") + finally: + release_resources() +``` + +### Output yang Diharapkan + +Menjalankan skrip pada folder yang berisi tiga PNG mungkin menghasilkan: + +``` +Engine already initialized. +Found 3 PNG image(s) to process. +Processed invoice1.png: 452 characters. +Processed receipt2.png: 317 characters. +Processed flyer3.png: 689 characters. +All results saved to /home/user/ocr_results.txt +AI resources released. +``` + +File `ocr_results.txt` akan berisi pemisah yang jelas untuk setiap gambar diikuti oleh teks OCR yang telah dibersihkan. + +## Stub Opsional untuk aocr & ai (Jika Anda Tidak Memiliki Paket Asli) + +Jika Anda hanya ingin menguji alur tanpa mengunduh perpustakaan OCR yang berat, Anda dapat membuat modul mock minimal: + +```python +# aocr/__init__.py +class Image: + @staticmethod + def load(path): + return f"ImageObject({path})" + +def dummy_recognize(image): + return "Raw OCR output for " + str(image) + +engine = type("Engine", (), {"recognize": dummy_recognize})() +``` + +```python +# ai/__init__.py +_state = {"initialized": False} + +def is_initialized(): + return _state["initialized"] + +def initialize(cfg): + print(f"Initializing AI engine with {cfg}") + _state["initialized"] = True + +def run_postprocessor(text): + # Very naive cleanup: strip extra spaces + return " ".join(text.split()) + +def free_resources(): + print("Freeing AI resources") + _state["initialized"] = False +``` + +Letakkan folder‑folder ini di samping `batch_ocr.py` dan skrip akan berjalan, mencetak hasil mock. + +## Tips Pro & Kesalahan Umum + +- **Lonjakan memori:** Jika Anda memproses ribuan PNG beresolusi tinggi, pertimbangkan untuk mengubah ukuran gambar sebelum OCR. `aocr.Image.load` sering menerima argumen `max_size`. 
+- **Penanganan Unicode:** Selalu buka file output dengan `encoding="utf-8"`; mesin OCR dapat menghasilkan karakter non‑ASCII. +- **Paralelisme:** Untuk OCR yang CPU‑bound Anda dapat membungkus `ocr_batch` dalam `concurrent.futures.ThreadPoolExecutor`. Ingat untuk menjaga satu instance `ai` – membuat banyak thread yang masing‑masing memanggil `ai.initialize` akan menggagalkan tujuan “bebaskan sumber daya AI”. +- **Ketahanan terhadap error:** Bungkus loop per‑gambar dalam blok `try/except` sehingga satu PNG yang rusak tidak menghentikan seluruh batch. + +## Kesimpulan + +Anda kini memiliki **python ocr tutorial** yang menunjukkan cara **load png image** file, melakukan **batch OCR processing**, dan mengelola **free AI resources** secara bertanggung jawab. Contoh lengkap yang dapat dijalankan ini memperlihatkan secara tepat cara **recognize text from image** objek dan membersihkan setelahnya, sehingga Anda dapat menyalin‑tempelnya ke proyek Anda sendiri tanpa harus mencari potongan kode yang hilang. + +Siap untuk langkah selanjutnya? Coba ganti modul `aocr` dan `ai` yang di‑stub dengan perpustakaan nyata seperti `pytesseract` dan `torchvision`. Anda juga dapat memperluas skrip untuk menghasilkan JSON, mengirim hasil ke basis data, atau mengintegrasikannya dengan bucket penyimpanan cloud. Langit adalah batasnya—selamat coding! 
+ +{{< /blocks/products/pf/tutorial-page-section >}} +{{< /blocks/products/pf/main-container >}} +{{< /blocks/products/pf/main-wrap-class >}} +{{< blocks/products/products-backtop-button >}} \ No newline at end of file diff --git a/ocr/indonesian/python/general/run-ocr-on-image-complete-guide-to-structured-text-extractio/_index.md b/ocr/indonesian/python/general/run-ocr-on-image-complete-guide-to-structured-text-extractio/_index.md new file mode 100644 index 000000000..dfff9e66f --- /dev/null +++ b/ocr/indonesian/python/general/run-ocr-on-image-complete-guide-to-structured-text-extractio/_index.md @@ -0,0 +1,255 @@ +--- +category: general +date: 2026-05-03 +description: Pelajari cara menjalankan OCR pada gambar dan mengekstrak teks beserta + koordinatnya menggunakan pengenalan OCR terstruktur. Kode Python langkah demi langkah + disertakan. +draft: false +keywords: +- run OCR on image +- extract text with coordinates +- structured OCR recognition +- OCR post‑processing +- bounding box extraction +- image text detection +language: id +og_description: Jalankan OCR pada gambar dan dapatkan teks dengan koordinat menggunakan + pengenalan OCR terstruktur. Contoh Python lengkap dengan penjelasan. +og_title: Jalankan OCR pada gambar – Tutorial Ekstraksi Teks Terstruktur +tags: +- OCR +- Python +- Computer Vision +title: Jalankan OCR pada gambar – Panduan Lengkap untuk Ekstraksi Teks Terstruktur +url: /id/python/general/run-ocr-on-image-complete-guide-to-structured-text-extractio/ +--- + +{{< blocks/products/pf/main-wrap-class >}} +{{< blocks/products/pf/main-container >}} +{{< blocks/products/pf/tutorial-page-section >}} + +# Jalankan OCR pada gambar – Panduan Lengkap untuk Ekstraksi Teks Terstruktur + +Pernah perlu **menjalankan OCR pada gambar** tetapi tidak yakin bagaimana cara mempertahankan posisi tepat setiap kata? Anda tidak sendirian. 
Dalam banyak proyek—pemindaian struk, digitalisasi formulir, atau pengujian UI—Anda membutuhkan tidak hanya teks mentah tetapi juga kotak pembatas yang memberi tahu di mana setiap baris berada pada gambar. + +Tutorial ini menunjukkan cara praktis untuk *menjalankan OCR pada gambar* menggunakan mesin **aocr**, meminta **pengenalan OCR terstruktur**, dan kemudian memproses hasilnya sambil mempertahankan geometri. Pada akhir tutorial Anda akan dapat **mengekstrak teks dengan koordinat** hanya dalam beberapa baris Python, dan Anda akan memahami mengapa mode terstruktur penting untuk tugas-tugas selanjutnya. + +## Apa yang Akan Anda Pelajari + +- Cara menginisialisasi mesin OCR untuk **pengenalan OCR terstruktur**. +- Cara memberi gambar ke mesin dan menerima hasil mentah yang mencakup batas baris. +- Cara menjalankan post‑processor yang membersihkan teks tanpa kehilangan geometri. +- Cara mengiterasi baris akhir dan mencetak setiap potongan teks bersama dengan kotak pembatasnya. + +Tidak ada sulap, tidak ada langkah tersembunyi—hanya contoh lengkap yang dapat dijalankan dan Anda dapat menambahkannya ke proyek Anda. + +--- + +## Prasyarat + +Sebelum kita mulai, pastikan Anda telah menginstal hal‑hal berikut: + +```bash +pip install aocr ai # hypothetical packages; replace with real ones if needed +``` + +Anda juga memerlukan file gambar (`input_image.png` atau `.jpg`) yang berisi teks yang jelas dan dapat dibaca. Apa saja mulai dari faktur yang dipindai hingga tangkapan layar dapat digunakan, selama mesin OCR dapat melihat karakter‑karakternya. + +--- + +## Langkah 1: Inisialisasi mesin OCR untuk pengenalan terstruktur + +Hal pertama yang kita lakukan adalah membuat instance `aocr.Engine()` dan memberi tahu bahwa kita menginginkan **pengenalan OCR terstruktur**. Mode terstruktur mengembalikan tidak hanya teks biasa tetapi juga data geometris (rektangel pembatas) untuk setiap baris, yang penting ketika Anda perlu memetakan teks kembali ke gambar. 
+ +```python +import aocr +import ai # hypothetical post‑processing module + +# Initialise the OCR engine +ocr_engine = aocr.Engine() + +# Request structured recognition (text + geometry) +ocr_engine.recognize_mode = aocr.RecognitionMode.Structured +``` + +> **Mengapa ini penting:** +> Dalam mode default mesin mungkin hanya memberikan satu string kata‑kata yang digabungkan. Mode terstruktur memberi Anda hierarki halaman → baris → kata, masing‑masing dengan koordinat, sehingga jauh lebih mudah menumpangkan hasil pada gambar asli atau memasukkannya ke model yang memperhatikan tata letak. + +--- + +## Langkah 2: Jalankan OCR pada gambar dan dapatkan hasil mentah + +Sekarang kita memberi gambar ke mesin. Pemanggilan `recognize` mengembalikan objek `OcrResult` yang berisi kumpulan baris, masing‑masing dengan kotak pembatasnya. + +```python +# Load your image (any format supported by aocr) +input_image_path = "input_image.png" + +# Run OCR – this returns an OcrResult with lines and bounds +raw_result = ocr_engine.recognize(input_image_path) +``` + +Pada titik ini `raw_result.lines` berisi objek dengan dua atribut penting: + +- `text` – string yang dikenali untuk baris tersebut. +- `bounds` – tuple seperti `(x, y, width, height)` yang menggambarkan posisi baris. + +--- + +## Langkah 3: Post‑process sambil mempertahankan geometri + +Output OCR mentah sering berisik: karakter‑karakter stray, spasi yang salah tempat, atau masalah pemenggalan baris. Fungsi `ai.run_postprocessor` membersihkan teks tetapi **mempertahankan geometri asli** tetap utuh, sehingga Anda masih memiliki koordinat yang akurat. + +```python +# Apply a post‑processing step that corrects common OCR errors +postprocessed_result = ai.run_postprocessor(raw_result) + +# The structure (lines + bounds) stays the same, only `line.text` changes +``` + +> **Tips pro:** Jika Anda memiliki kosakata khusus domain (misalnya kode produk), beri kamus khusus ke post‑processor untuk meningkatkan akurasi. 
+ +--- + +## Langkah 4: Ekstrak teks dengan koordinat – iterasi dan tampilkan + +Akhirnya, kita melakukan loop pada baris‑baris yang telah dibersihkan, mencetak kotak pembatas setiap baris bersama teksnya. Inilah inti dari **mengekstrak teks dengan koordinat**. + +```python +# Print each recognised line together with its bounding box +for line in postprocessed_result.lines: + print(f"[{line.bounds}] {line.text}") +``` + +### Output yang Diharapkan + +Dengan asumsi gambar masukan berisi dua baris: “Invoice #12345” dan “Total: $89.99”, Anda akan melihat sesuatu seperti: + +``` +[(15, 30, 210, 25)] Invoice #12345 +[(15, 70, 190, 25)] Total: $89.99 +``` + +Tuple pertama adalah `(x, y, width, height)` dari baris pada gambar asli, memungkinkan Anda menggambar kotak, menyorot teks, atau memasukkan koordinat ke sistem lain. + +--- + +## Visualisasi Hasil (Opsional) + +Jika Anda ingin melihat kotak pembatas ditumpangkan pada gambar, Anda dapat menggunakan Pillow (PIL) untuk menggambar persegi panjang. Berikut cuplikan singkat; lewati jika Anda hanya membutuhkan data mentah. + +```python +from PIL import Image, ImageDraw + +# Open the original image +img = Image.open(input_image_path) +draw = ImageDraw.Draw(img) + +# Draw a rectangle around each line +for line in postprocessed_result.lines: + x, y, w, h = line.bounds + draw.rectangle([x, y, x + w, y + h], outline="red", width=2) + +# Save or show the annotated image +img.save("annotated_output.png") +img.show() +``` + +![run OCR on image contoh menampilkan kotak pembatas](/images/ocr-bounding-boxes.png "run OCR on image – bounding box overlay") + +Teks alt di atas mengandung **kata kunci utama**, memenuhi persyaratan SEO untuk atribut alt gambar. + +--- + +## Mengapa Pengenalan OCR Terstruktur Lebih Baik daripada Ekstraksi Teks Sederhana + +Anda mungkin bertanya, “Bukankah saya cukup menjalankan OCR dan mendapatkan teks? 
Mengapa harus repot dengan geometri?” + +- **Konteks spasial:** Ketika Anda perlu memetakan bidang pada formulir (misalnya “Tanggal” di sebelah nilai tanggal), koordinat memberi tahu *di mana* data berada. +- **Tata letak multi‑kolom:** Teks linier sederhana kehilangan urutan; data terstruktur mempertahankan urutan kolom. +- **Akurasi post‑processing:** Mengetahui ukuran kotak membantu Anda memutuskan apakah sebuah kata adalah judul, catatan kaki, atau artefak stray. + +Singkatnya, **pengenalan OCR terstruktur** memberi Anda fleksibilitas untuk membangun pipeline yang lebih pintar—baik Anda memasukkan data ke basis data, membuat PDF yang dapat dicari, atau melatih model pembelajaran mesin yang menghormati tata letak. + +--- + +## Kasus Pinggiran Umum dan Cara Menanganinya + +| Situasi | Hal yang Perlu Diperhatikan | Solusi yang Disarankan | +|-----------|-------------------|---------------| +| **Gambar diputar atau miring** | Kotak pembatas mungkin tidak sejajar. | Lakukan pra‑proses dengan deskewing (misalnya `warpAffine` dari OpenCV). | +| **Font sangat kecil** | Mesin mungkin melewatkan karakter, menghasilkan baris kosong. | Tingkatkan resolusi gambar atau gunakan `ocr_engine.set_dpi(300)`. | +| **Bahasa campuran** | Model bahasa yang salah dapat menghasilkan teks berantakan. | Atur `ocr_engine.language = ["en", "de"]` sebelum pengenalan. | +| **Kotak tumpang tindih** | Post‑processor mungkin menggabungkan dua baris secara tidak sengaja. | Verifikasi `line.bounds` setelah pemrosesan; sesuaikan ambang pada `ai.run_postprocessor`. | + +Menangani skenario ini sejak awal menghemat banyak masalah di kemudian hari, terutama ketika Anda menskalakan solusi ke ratusan dokumen per hari. + +--- + +## Skrip End‑to‑End Lengkap + +Berikut adalah program lengkap yang siap dijalankan dan menggabungkan semua langkah. Salin‑tempel, sesuaikan jalur gambar, dan Anda siap. 
+ +```python +# -*- coding: utf-8 -*- +""" +Run OCR on image – extract text with coordinates using structured OCR recognition. +Author: Your Name +Date: 2026-05-03 +""" + +import aocr +import ai +from PIL import Image, ImageDraw + +def run_structured_ocr(image_path: str, annotate: bool = False): + # 1️⃣ Initialise the OCR engine + ocr_engine = aocr.Engine() + ocr_engine.recognize_mode = aocr.RecognitionMode.Structured + + # 2️⃣ Recognise the image + raw_result = ocr_engine.recognize(image_path) + + # 3️⃣ Post‑process while keeping geometry + processed = ai.run_postprocessor(raw_result) + + # 4️⃣ Print each line with its bounding box + for line in processed.lines: + print(f"[{line.bounds}] {line.text}") + + # Optional visualisation + if annotate: + img = Image.open(image_path) + draw = ImageDraw.Draw(img) + for line in processed.lines: + x, y, w, h = line.bounds + draw.rectangle([x, y, x + w, y + h], outline="red", width=2) + annotated_path = "annotated_" + image_path + img.save(annotated_path) + print(f"Annotated image saved as {annotated_path}") + +if __name__ == "__main__": + INPUT_IMG = "input_image.png" + run_structured_ocr(INPUT_IMG, annotate=True) +``` + +Menjalankan skrip ini akan: + +1. **Menjalankan OCR pada gambar** dengan mode terstruktur. +2. **Mengekstrak teks dengan koordinat** untuk setiap baris. +3. Secara opsional menghasilkan PNG beranotasi yang menampilkan kotak pembatas. + +--- + +## Kesimpulan + +Anda kini memiliki solusi mandiri yang solid untuk **menjalankan OCR pada gambar** dan **mengekstrak teks dengan koordinat** menggunakan **pengenalan OCR terstruktur**. Kode ini memperlihatkan setiap langkah—dari inisialisasi mesin hingga post‑processing dan verifikasi visual—sehingga Anda dapat menyesuaikannya untuk struk, formulir, atau dokumen visual apa pun yang memerlukan lokalisasi teks yang tepat. + +Apa selanjutnya? Coba ganti mesin `aocr` dengan pustaka lain (Tesseract, EasyOCR) dan lihat bagaimana output terstruktur mereka berbeda. 
Bereksperimenlah dengan strategi post‑processing lain, seperti pemeriksaan ejaan atau filter regex khusus, untuk meningkatkan akurasi pada domain Anda. Dan jika Anda membangun pipeline yang lebih besar, pertimbangkan menyimpan pasangan `(text, bounds)` ke dalam basis data untuk analisis di masa mendatang. + +Selamat coding, semoga proyek OCR Anda selalu akurat! + +{{< /blocks/products/pf/tutorial-page-section >}} +{{< /blocks/products/pf/main-container >}} +{{< /blocks/products/pf/main-wrap-class >}} +{{< blocks/products/products-backtop-button >}} \ No newline at end of file diff --git a/ocr/italian/python/general/extract-text-from-image-ocr-with-aspose-ai-spell-check/_index.md b/ocr/italian/python/general/extract-text-from-image-ocr-with-aspose-ai-spell-check/_index.md new file mode 100644 index 000000000..14beb4c22 --- /dev/null +++ b/ocr/italian/python/general/extract-text-from-image-ocr-with-aspose-ai-spell-check/_index.md @@ -0,0 +1,230 @@ +--- +category: general +date: 2026-05-03 +description: estrai il testo da un'immagine usando Aspose OCR e il controllo ortografico + AI. Scopri come eseguire l'OCR su un'immagine, caricare l'immagine per l'OCR, riconoscere + il testo da una fattura e rilasciare le risorse GPU. +draft: false +keywords: +- extract text from image +- how to ocr image +- load image for ocr +- release gpu resources +- recognize text from invoice +language: it +og_description: estrai il testo da un'immagine con Aspose OCR e il controllo ortografico + AI. Guida passo‑passo che copre come eseguire l'OCR su un'immagine, caricare l'immagine + per l'OCR e rilasciare le risorse GPU. 
+og_title: Estrai testo da immagine – Guida completa a OCR e correzione ortografica +tags: +- OCR +- Aspose +- AI +- Python +title: Estrai testo da immagine – OCR con Aspose AI Spell‑Check +url: /it/python/general/extract-text-from-image-ocr-with-aspose-ai-spell-check/ +--- + +{{< blocks/products/pf/main-wrap-class >}} +{{< blocks/products/pf/main-container >}} +{{< blocks/products/pf/tutorial-page-section >}} + +# estrarre testo da immagine – Guida completa OCR & Spell‑Check + +Ti è mai capitato di dover **estrarre testo da immagine** ma non eri sicuro quale libreria ti offrisse sia velocità che precisione? Non sei l'unico. In molti progetti reali—pensa all'elaborazione di fatture, alla digitalizzazione di ricevute o alla scansione di contratti—ottenere testo pulito e ricercabile da un'immagine è il primo ostacolo. + +La buona notizia è che Aspose OCR, abbinato a un modello leggero Aspose AI, può gestire questo compito in poche righe di Python. In questo tutorial vedremo **come fare OCR su un'immagine**, caricare correttamente la foto, eseguire un correttore ortografico integrato e infine **rilasciare le risorse GPU** affinché la tua app rimanga a basso consumo di memoria. + +Alla fine di questa guida sarai in grado di **riconoscere testo da immagini di fatture**, correggere automaticamente gli errori OCR più comuni e mantenere la tua GPU pulita per il prossimo batch. + +--- + +## Di cosa avrai bisogno + +- Python 3.9 o versioni successive (il codice usa type hints ma funziona anche su versioni 3.x precedenti) +- Pacchetti `aspose-ocr` e `aspose-ai` (installali con `pip install aspose-ocr aspose-ai`) +- Una GPU abilitata CUDA è opzionale; lo script tornerà alla CPU se non ne trova una. +- Un'immagine di esempio, ad es. `sample_invoice.png`, posizionata in una cartella a cui puoi fare riferimento. + +Nessun framework ML pesante, nessun download di modelli enormi—solo un piccolo modello quantizzato Q4‑K‑M che si adatta comodamente alla maggior parte delle GPU. 
+ +--- + +## Passo 1: Inizializzare il motore OCR – estrarre testo da immagine + +La prima cosa da fare è creare un'istanza di `OcrEngine` e indicare quale lingua ti aspetti. Qui scegliamo l'inglese e richiediamo un output in plain‑text, ideale per l'elaborazione successiva. + +```python +import aocr # Aspose OCR package +import aspose.ai as ai # Aspose AI package + +# Initialise the OCR engine +ocr_engine = aocr.OcrEngine() +ocr_engine.language = aocr.Language.English # Choose any supported language +ocr_engine.recognize_mode = aocr.RecognitionMode.Plain # Plain text makes post‑processing easier +``` + +**Perché è importante:** Impostare la lingua restringe il set di caratteri, migliorando la precisione. La modalità plain‑text rimuove le informazioni di layout che tipicamente non ti servono quando vuoi solo estrarre testo da immagine. + +--- + +## Passo 2: Caricare l'immagine per OCR – come fare OCR su un'immagine + +Ora forniamo al motore un'immagine reale. L'helper `Image.load` riconosce i formati più comuni (PNG, JPEG, TIFF) e astrae le particolarità del file‑IO. + +```python +# Load the input image – this is the "load image for OCR" step +input_image = aocr.Image.load("YOUR_DIRECTORY/sample_invoice.png") +raw_text = ocr_engine.recognize(input_image) # Returns the recognised text as a string +``` + +**Suggerimento:** Se le tue immagini di origine sono grandi, considera di ridimensionarle prima di inviarle al motore; dimensioni più piccole possono ridurre l'uso di memoria GPU senza compromettere la qualità del riconoscimento. + +--- + +## Passo 3: Configurare il modello Aspose AI – riconoscere testo da fattura + +Aspose AI include un piccolo modello GGUF che puoi scaricare automaticamente. L'esempio utilizza il repository `Qwen2.5‑3B‑Instruct‑GGUF`, quantizzato a `q4_k_m`. Indichiamo inoltre al runtime di allocare 20 layer sulla GPU, bilanciando velocità e utilizzo della VRAM. 
+ +```python +# Model configuration – auto‑download a small Q4‑K‑M quantised model +model_config = ai.AsposeAIModelConfig() +model_config.allow_auto_download = "true" +model_config.hugging_face_repo_id = "bartowski/Qwen2.5-3B-Instruct-GGUF" +model_config.hugging_face_quantization = "q4_k_m" +model_config.gpu_layers = 20 # Use 20 GPU layers when a GPU is available +``` + +**Dietro le quinte:** Il modello quantizzato occupa circa 1,5 GB su disco, una frazione di un modello a piena precisione, ma cattura comunque abbastanza sfumature linguistiche da segnalare gli errori tipici dell'OCR. + +--- + +## Passo 4: Inizializzare AsposeAI e collegare il post‑processore di correzione ortografica + +Aspose AI include un post‑processore di correzione ortografica pronto all'uso. Collegandolo, ogni risultato OCR verrà pulito automaticamente. + +```python +# Initialise AsposeAI and attach the built‑in spell‑check post‑processor +ocr_ai = ai.AsposeAI(model_config) # Pass the config we just built +ocr_ai.set_post_processor(ocr_ai.postprocessor_spell_check, {}) # Empty dict → default settings +``` + +**Perché usare il post‑processore?** I motori OCR spesso leggono “Invoice” come “Invo1ce” o “Total” come “T0tal”. La correzione ortografica esegue un modello linguistico leggero sulla stringa grezza e corregge quegli errori senza che tu debba scrivere un dizionario personalizzato. + +--- + +## Passo 5: Eseguire il post‑processore di correzione ortografica sul risultato OCR + +Con tutto collegato, una singola chiamata restituisce il testo corretto. Stampiamo anche entrambe le versioni, originale e pulita, così puoi vedere il miglioramento. + +```python +# Run the spell‑check post‑processor on the OCR result +corrected_text = ocr_ai.run_postprocessor(raw_text) + +print("Original :", raw_text) +print("Corrected:", corrected_text) +``` + +Un output tipico per una fattura potrebbe apparire così: + +``` +Original : Invo1ce #12345 +Date: 2023/07/15 +Total: $1,250.00 +... 
+Corrected: Invoice #12345 +Date: 2023/07/15 +Total: $1,250.00 +... +``` + +Nota come “Invo1ce” sia diventato la parola corretta “Invoice”. Questa è la potenza del correttore ortografico AI integrato. + +--- + +## Passo 6: Rilasciare le risorse GPU – rilasciare le risorse GPU in modo sicuro + +Se esegui questo in un servizio a lungo termine (ad esempio un'API web che elabora decine di fatture al minuto), devi liberare il contesto GPU dopo ogni batch. Altrimenti vedrai perdite di memoria e alla fine otterrai errori “CUDA out of memory”. + +```python +# Release GPU resources – crucial to avoid memory leaks +ocr_ai.free_resources() +``` + +**Consiglio professionale:** Chiama `free_resources()` all'interno di un blocco `finally` o di un context manager affinché venga sempre eseguito, anche in caso di eccezione. + +--- + +## Esempio completo funzionante + +Unire tutti i componenti ti fornisce uno script autonomo che puoi inserire in qualsiasi progetto. + +```python +# extract_text_from_image.py +import aocr +import aspose.ai as ai + +def main(): + # Step 1: Initialise OCR engine + ocr_engine = aocr.OcrEngine() + ocr_engine.language = aocr.Language.English + ocr_engine.recognize_mode = aocr.RecognitionMode.Plain + + # Step 2: Load image for OCR + input_image = aocr.Image.load("YOUR_DIRECTORY/sample_invoice.png") + raw_text = ocr_engine.recognize(input_image) + + # Step 3: Configure Aspose AI model + model_cfg = ai.AsposeAIModelConfig() + model_cfg.allow_auto_download = "true" + model_cfg.hugging_face_repo_id = "bartowski/Qwen2.5-3B-Instruct-GGUF" + model_cfg.hugging_face_quantization = "q4_k_m" + model_cfg.gpu_layers = 20 + + # Step 4: Initialise AI and attach spell‑check + ocr_ai = ai.AsposeAI(model_cfg) + ocr_ai.set_post_processor(ocr_ai.postprocessor_spell_check, {}) + + # Step 5: Run spell‑check + corrected_text = ocr_ai.run_postprocessor(raw_text) + + print("Original :", raw_text) + print("Corrected:", corrected_text) + + # Step 6: Release GPU resources + 
ocr_ai.free_resources() + +if __name__ == "__main__": + main() +``` + +Salva il file, regola il percorso della tua immagine ed esegui `python extract_text_from_image.py`. Dovresti vedere il testo della fattura pulito stampato sulla console. + +--- + +## Domande frequenti (FAQ) + +**Q: Funziona su macchine solo CPU?** +**A:** Assolutamente. Se non viene rilevata alcuna GPU, Aspose AI ricade sull'esecuzione CPU, anche se sarà più lenta. Puoi forzare la CPU impostando `model_cfg.gpu_layers = 0`. + +**Q: E se le mie fatture sono in una lingua diversa dall'inglese?** +**A:** Cambia `ocr_engine.language` al valore enum appropriato (ad es., `aocr.Language.Spanish`). Il modello di correzione ortografica è multilingue, ma potresti ottenere risultati migliori con un modello specifico per la lingua. + +**Q: Posso elaborare più immagini in un ciclo?** +**A:** Sì. Sposta semplicemente i passaggi di caricamento, riconoscimento e post‑processing all'interno di un ciclo `for`. Ricorda di chiamare `ocr_ai.free_resources()` dopo il ciclo o dopo ogni batch se riutilizzi la stessa istanza AI. + +**Q: Quanto è grande il download del modello?** +**A:** Circa 1,5 GB per la versione quantizzata `q4_k_m`. Viene memorizzato nella cache dopo la prima esecuzione, quindi le esecuzioni successive sono istantanee. + +--- + +## Conclusione + +In questo tutorial abbiamo mostrato come **estrarre testo da immagine** usando Aspose OCR, configurare un piccolo modello AI, applicare un post‑processore di correzione ortografica e rilasciare in modo sicuro le **risorse GPU**. Il flusso di lavoro copre tutto, dal caricamento dell'immagine alla pulizia finale, fornendoti una pipeline affidabile per scenari di **riconoscimento testo da fattura**. + +Passi successivi? 
Prova a sostituire il correttore ortografico con un modello personalizzato di estrazione di entità + +{{< /blocks/products/pf/tutorial-page-section >}} +{{< /blocks/products/pf/main-container >}} +{{< /blocks/products/pf/main-wrap-class >}} +{{< blocks/products/products-backtop-button >}} \ No newline at end of file diff --git a/ocr/italian/python/general/how-to-batch-ocr-with-aspose-ocr-full-python-guide/_index.md b/ocr/italian/python/general/how-to-batch-ocr-with-aspose-ocr-full-python-guide/_index.md new file mode 100644 index 000000000..662e268fe --- /dev/null +++ b/ocr/italian/python/general/how-to-batch-ocr-with-aspose-ocr-full-python-guide/_index.md @@ -0,0 +1,214 @@ +--- +category: general +date: 2026-05-03 +description: Come eseguire OCR batch di immagini usando Aspose OCR e il controllo + ortografico AI. Impara a estrarre testo dalle immagini, applicare il controllo ortografico, + risorse AI gratuite e correggere gli errori OCR. +draft: false +keywords: +- how to batch ocr +- extract text from images +- free ai resources +- apply spell check +- correct ocr errors +language: it +og_description: Come eseguire l'OCR batch di immagini usando Aspose OCR e il controllo + ortografico AI. Segui una guida passo‑passo per estrarre il testo dalle immagini, + applicare il controllo ortografico, liberare risorse AI e correggere gli errori + di OCR. +og_title: Come fare OCR in batch con Aspose OCR – Tutorial completo in Python +tags: +- OCR +- Python +- AI +- Aspose +title: Come eseguire OCR batch con Aspose OCR – Guida completa in Python +url: /it/python/general/how-to-batch-ocr-with-aspose-ocr-full-python-guide/ +--- + +{{< blocks/products/pf/main-wrap-class >}} +{{< blocks/products/pf/main-container >}} +{{< blocks/products/pf/tutorial-page-section >}} + +# Come eseguire OCR batch con Aspose OCR – Guida completa in Python + +Ti sei mai chiesto **come fare OCR batch** su un'intera cartella di PDF o foto scansionate senza scrivere uno script separato per ogni file? 
Non sei solo. In molte pipeline reali dovrai **estrarre testo dalle immagini**, correggere errori ortografici e infine liberare le risorse AI che hai allocato. Questo tutorial ti mostra esattamente come farlo con Aspose OCR, un post‑processor AI leggero, e poche righe di Python. + +Passeremo in rassegna l'inizializzazione del motore OCR, l'integrazione di un correttore ortografico AI, l'iterazione su una directory di immagini e la pulizia del modello al termine. Alla fine avrai uno script pronto all'uso che **corregge automaticamente gli errori OCR** e rilascia **risorse AI gratuite** così la tua GPU rimane felice. + +## Cosa ti servirà + +- Python 3.9+ (il codice usa type‑hints ma funziona anche su versioni 3.x precedenti) +- Pacchetto `asposeocr` (`pip install asposeocr`) – fornisce il motore OCR. +- Accesso al modello Hugging Face `bartowski/Qwen2.5-3B-Instruct-GGUF` (scaricato automaticamente). +- Una GPU con almeno qualche GB di VRAM (lo script imposta `gpu_layers = 30`, puoi ridurlo se necessario). + +Nessun servizio esterno, nessuna API a pagamento – tutto gira localmente. + +--- + +## Passo 1: Configura il motore OCR – **Come fare OCR batch** in modo efficiente + +Prima di poter elaborare mille immagini abbiamo bisogno di un motore OCR solido. Aspose OCR ci permette di scegliere lingua e modalità di riconoscimento con una singola chiamata. + +```python +# Step 1: Initialize the OCR engine for English plain‑text output +def init_ocr() -> aocr.OcrEngine: + ocr_engine = aocr.OcrEngine() + ocr_engine.language = aocr.Language.English # English language pack + ocr_engine.recognize_mode = aocr.RecognitionMode.Plain # Returns raw string, no layout + return ocr_engine +``` + +**Perché è importante:** Impostare `recognize_mode` su `Plain` mantiene l'output leggero, ideale quando prevedi di eseguire un controllo ortografico in seguito. 
Se ti servissero informazioni di layout, passeresti a `Layout`, ma ciò aggiunge overhead che probabilmente non vuoi in un lavoro batch. + +> **Consiglio professionale:** Se stai gestendo scansioni multilingue, puoi passare una lista come `ocr_engine.language = [aocr.Language.English, aocr.Language.Spanish]`. + +--- + +## Passo 2: Inizializza il post‑processor AI – **Applica il controllo ortografico** all'output OCR + +```python +# Step 2: Configure and start the AI post‑processor +def init_ai() -> aocr.ai.AsposeAI: + model_cfg = AsposeAIModelConfig() + model_cfg.allow_auto_download = "true" + model_cfg.hugging_face_repo_id = "bartowski/Qwen2.5-3B-Instruct-GGUF" + model_cfg.hugging_face_quantization = "q4_k_m" + model_cfg.gpu_layers = 30 # Adjust based on your GPU memory + ai_processor = AsposeAI() + ai_processor.initialize(model_cfg) + + # Attach the built‑in spell‑check post‑processor + ai_processor.set_post_processor(ai_processor.postprocessor_spell_check, {}) + return ai_processor +``` + +**Perché è importante:** Il modello è quantizzato (`q4_k_m`), il che riduce drasticamente l'uso di memoria mantenendo una buona comprensione linguistica. Chiamando `set_post_processor` diciamo ad Aspose AI di eseguire automaticamente il passaggio **apply spell check** su qualsiasi stringa gli forniamo. + +> **Attenzione:** Se la tua GPU non riesce a gestire 30 layer, riduci il numero a 15 o anche 5 – lo script funzionerà comunque, solo un po' più lentamente. + +--- + +## Passo 3: Esegui OCR e **Correggi gli errori OCR** su un'immagine singola + +Ora che sia il motore OCR sia il correttore ortografico AI sono pronti, li combiniamo. Questa funzione carica un'immagine, estrae il testo grezzo, poi esegue il post‑processor AI per pulirlo. 
+ +```python +# Step 3: OCR an image and run the spell‑check post‑processor +def ocr_and_correct(image_path: str, + ocr_engine: aocr.OcrEngine, + ai_processor: aocr.ai.AsposeAI) -> str: + image = aocr.Image.load(image_path) # Load any supported format + raw_text = ocr_engine.recognize(image) # Plain string from OCR + corrected_text = ai_processor.run_postprocessor(raw_text) + return corrected_text +``` + +**Perché è importante:** Inviare direttamente la stringa OCR grezza al modello AI ci fornisce un passaggio **correct OCR errors** senza scrivere regex o dizionari personalizzati. Il modello comprende il contesto, così può correggere “recieve” → “receive” e anche errori più sottili. + +--- + +## Passo 4: **Estrai testo dalle immagini** in blocco – Il vero ciclo batch + +Qui è dove la magia di **come fare OCR batch** brilla. Iteriamo su una directory, saltiamo i file non supportati e scriviamo ogni output corretto in un file `.txt`. + +```python +# Step 4: Process an entire folder of images +if __name__ == "__main__": + # Initialize once – reuse for every file + ocr_engine = init_ocr() + ai_processor = init_ai() + + input_dir = "YOUR_DIRECTORY/input_images" + output_dir = "YOUR_DIRECTORY/output_text" + os.makedirs(output_dir, exist_ok=True) + + for file_name in os.listdir(input_dir): + # Only handle common image extensions + if not file_name.lower().endswith(('.png', '.jpg', '.jpeg', '.tif', '.tiff')): + continue + + image_path = os.path.join(input_dir, file_name) + corrected = ocr_and_correct(image_path, ocr_engine, ai_processor) + + txt_path = os.path.join(output_dir, + os.path.splitext(file_name)[0] + ".txt") + with open(txt_path, "w", encoding="utf-8") as txt_file: + txt_file.write(corrected) + + print(f"Processed {file_name}") + + # Step 5: Release **free AI resources** after the batch finishes + ai_processor.free_resources() +``` + +### Output previsto + +Per un'immagine contenente la frase *“The quick brown fox jumps over the lazzy dog.”* vedrai un file di 
testo con: + +``` +The quick brown fox jumps over the lazy dog. +``` + +Nota che la doppia “z” è stata corretta automaticamente – è il controllo ortografico AI in azione. + +**Perché è importante:** Creando gli oggetti OCR e AI **una sola volta** e riutilizzandoli, evitiamo l'overhead di caricare il modello per ogni file. Questo è il modo più efficiente per **come fare OCR batch** su larga scala. + +--- + +## Passo 5: Pulizia – **Libera le risorse AI** correttamente + +Quando hai finito, chiamare `free_resources()` rilascia la memoria GPU, i contesti CUDA e tutti i file temporanei creati dal modello. + +```python +# Step 5: Explicitly free GPU and model memory +ai_processor.free_resources() +``` + +Saltare questo passaggio può lasciare allocazioni GPU pendenti, che potrebbero far crashare i processi Python successivi o consumare VRAM. Consideralo la parte “spegni le luci” di un lavoro batch. + +--- + +## Problemi comuni & consigli extra + +| Problema | Cosa controllare | Soluzione | +|----------|------------------|-----------| +| **Errori out‑of‑memory** | La GPU si esaurisce dopo qualche decina di immagini | Riduci `gpu_layers` o passa alla CPU (`model_cfg.gpu_layers = 0`). | +| **Pacchetto lingua mancante** | OCR restituisce stringhe vuote | Assicurati che la versione di `asposeocr` includa i dati della lingua inglese; reinstalla se necessario. | +| **File non‑immagine** | Lo script crasha su un `.pdf` errante | La guardia `if not file_name.lower().endswith(...)` li salta già. | +| **Controllo ortografico non applicato** | L'output è identico all'OCR grezzo | Verifica che `ai_processor.set_post_processor` sia stato chiamato prima del ciclo. | +| **Velocità batch lenta** | Richiede >5 secondi per immagine | Abilita `model_cfg.allow_auto_download = "false"` dopo la prima esecuzione, così il modello non viene riscaricato ogni volta. 
| + +**Consiglio professionale:** Se hai bisogno di **estrarre testo dalle immagini** in una lingua diversa dall'inglese, cambia semplicemente `ocr_engine.language` nell'enum appropriato (ad esempio `aocr.Language.French`). Lo stesso post‑processor AI applicherà comunque il controllo ortografico, ma potresti voler un modello specifico per la lingua per ottenere i migliori risultati. + +--- + +## Riepilogo & prossimi passi + +Abbiamo coperto l'intera pipeline per **come fare OCR batch**: + +1. **Inizializza** un motore OCR plain‑text per l'inglese. +2. **Configura** un modello AI di controllo ortografico e collegalo come post‑processor. +3. **Esegui** OCR su ogni immagine e lascia che l'AI **corregga gli errori OCR** automaticamente. +4. **Itera** su una directory per **estrarre testo dalle immagini** in blocco. +5. **Libera le risorse AI** una volta terminato il lavoro. + +Da qui potresti: + +- Inoltrare il testo corretto a una pipeline NLP a valle (analisi del sentiment, estrazione di entità, ecc.). +- Sostituire il post‑processor di controllo ortografico con un riepilogatore personalizzato chiamando `ai_processor.set_post_processor(your_custom_func, {})`. +- Parallelizzare il ciclo della cartella con `concurrent.futures.ThreadPoolExecutor` se la tua GPU può gestire più stream. + +--- + +## Considerazioni finali + +Eseguire OCR in batch non deve essere un compito gravoso. Sfruttando Aspose OCR insieme a un modello AI leggero, ottieni una **soluzione tutto‑in‑uno** che **estrae testo dalle immagini**, **applica il controllo ortografico**, **corregge gli errori OCR** e **libera le risorse AI** in modo pulito. Prova lo script su una cartella di test, regola il conteggio dei layer GPU per adattarlo al tuo hardware, e avrai una pipeline pronta per la produzione in pochi minuti. + +Hai domande su come modificare il modello, gestire i PDF o integrare tutto in un servizio web? Lascia un commento qui sotto o contattami su GitHub. 
Buona programmazione, e che il tuo OCR sia sempre preciso! + +{{< /blocks/products/pf/tutorial-page-section >}} +{{< /blocks/products/pf/main-container >}} +{{< /blocks/products/pf/main-wrap-class >}} +{{< blocks/products/products-backtop-button >}} \ No newline at end of file diff --git a/ocr/italian/python/general/python-ocr-tutorial-batch-ocr-processing-made-easy/_index.md b/ocr/italian/python/general/python-ocr-tutorial-batch-ocr-processing-made-easy/_index.md new file mode 100644 index 000000000..15eda0081 --- /dev/null +++ b/ocr/italian/python/general/python-ocr-tutorial-batch-ocr-processing-made-easy/_index.md @@ -0,0 +1,299 @@ +--- +category: general +date: 2026-05-03 +description: Tutorial Python OCR che mostra come caricare file immagine PNG, riconoscere + il testo dall'immagine e risorse AI gratuite per l'elaborazione OCR in batch. +draft: false +keywords: +- python ocr tutorial +- batch ocr processing +- free ai resources +- load png image +- recognize text from image +language: it +og_description: Il tutorial OCR in Python ti guida nel caricamento di immagini PNG, + nel riconoscimento del testo dall’immagine e nella gestione delle risorse AI gratuite + per l’elaborazione OCR batch. +og_title: Tutorial OCR Python – OCR batch veloce con risorse AI gratuite +tags: +- OCR +- Python +- AI +title: Tutorial Python OCR – Elaborazione OCR batch semplificata +url: /it/python/general/python-ocr-tutorial-batch-ocr-processing-made-easy/ +--- + +{{< blocks/products/pf/main-wrap-class >}} +{{< blocks/products/pf/main-container >}} +{{< blocks/products/pf/tutorial-page-section >}} + +# Tutorial Python OCR – Elaborazione OCR in Batch Semplificata + +Hai mai avuto bisogno di un **python ocr tutorial** che ti permetta davvero di eseguire OCR su decine di file PNG senza impazzire? Non sei solo. In molti progetti reali devi **load png image** file, alimentarli a un motore, e poi pulire le risorse AI quando hai finito. 
+ +In questa guida percorreremo un esempio completo, pronto‑da‑eseguire, che mostra esattamente come **recognize text from image** file, elaborarli in batch e liberare la memoria AI sottostante. Alla fine avrai uno script autonomo che potrai inserire in qualsiasi progetto—senza fronzoli, solo l’essenziale. + +## Cosa ti servirà + +- Python 3.10 o versioni successive (la sintassi usata qui si basa su f‑strings e type hints) +- Una libreria OCR che esponga un metodo `engine.recognize` – per scopi dimostrativi assumiamo un pacchetto fittizio `aocr`, ma puoi sostituirlo con Tesseract, EasyOCR, ecc. +- Il modulo helper `ai` mostrato nello snippet di codice (gestisce l’inizializzazione del modello e la pulizia delle risorse) +- Una cartella piena di file PNG che desideri processare + +Se non hai `aocr` o `ai` installati, puoi simularli con stub—vedi la sezione “Stub Opzionali” alla fine. + +## Step 1: Inizializza il Motore AI (Free AI Resources) + +Prima di alimentare qualsiasi immagine nella pipeline OCR, il modello sottostante deve essere pronto. Inizializzare una sola volta salva memoria e velocizza i lavori batch. + +```python +# step_1_initialize.py +import ai # hypothetical helper that wraps the AI model +import aocr # OCR library + +def init_engine(config_path: str = "config.yaml"): + """ + Initialize the AI engine if it hasn't been set up yet. + This uses free AI resources – the engine will be released later. + """ + if not ai.is_initialized(): + ai.initialize(config_path) # auto‑initialize with the provided configuration + else: + print("Engine already initialized.") +``` + +**Perché è importante:** +Chiamare `ai.initialize` ripetutamente per ogni immagine allocerebbe memoria GPU più e più volte, finendo per far crashare lo script. Controllando `ai.is_initialized()` garantiamo un’unica allocazione – è il principio del “free AI resources”. 
+ +## Step 2: Carica i File PNG per l'Elaborazione OCR in Batch + +Ora raccogliamo tutti i file PNG che vogliamo far passare attraverso l’OCR. Usare `pathlib` mantiene il codice indipendente dal sistema operativo. + +```python +# step_2_load_images.py +from pathlib import Path +from typing import List + +def collect_png_paths(directory: str) -> List[Path]: + """ + Scan `directory` and return a list of Path objects pointing to PNG files. + """ + base_path = Path(directory) + if not base_path.is_dir(): + raise NotADirectoryError(f"'{directory}' is not a valid folder.") + + png_files = sorted(base_path.glob("*.png")) + if not png_files: + raise FileNotFoundError("No PNG images found in the specified directory.") + + print(f"Found {len(png_files)} PNG image(s) to process.") + return png_files +``` + +**Caso limite:** +Se la cartella contiene file non‑PNG (ad es., JPEG) verranno ignorati, evitando che `engine.recognize` si blocchi su un formato non supportato. + +## Step 3: Esegui OCR su Ogni Immagine e Applica il Post‑Processing + +Con il motore pronto e la lista di file preparata, possiamo iterare sulle immagini, estrarre il testo grezzo e passarlo a un post‑processore che pulisce gli artefatti OCR comuni (come interruzioni di riga indesiderate). + +```python +# step_3_ocr_batch.py +import aocr +import ai +from pathlib import Path +from typing import List + +def ocr_batch(image_paths: List[Path]) -> List[str]: + """ + Perform OCR on each PNG image and return a list of cleaned strings. 
+ """ + results = [] + for image_path in image_paths: + # Load the image – aocr.Image.load abstracts away Pillow/OpenCV details + img = aocr.Image.load(str(image_path)) + + # Recognize raw text + raw_text = engine.recognize(img) + + # Refine the raw OCR output using the AI post‑processor + cleaned_text = ai.run_postprocessor(raw_text) + results.append(cleaned_text) + + print(f"Processed {image_path.name}: {len(cleaned_text)} characters extracted.") + + return results +``` + +**Perché separiamo il caricamento dal riconoscimento:** +`aocr.Image.load` può eseguire una decodifica lazy, più veloce per grandi batch. Tenere esplicito il passaggio di caricamento rende anche più semplice sostituire la libreria immagine se in seguito dovrai gestire JPEG o TIFF. + +## Step 4: Pulizia – Free AI Resources Dopo il Batch + +Una volta terminato il batch, dobbiamo rilasciare il modello per evitare perdite di memoria, specialmente su macchine con GPU. + +```python +# step_4_cleanup.py +import ai + +def release_resources(): + """ + Free any allocated AI resources. Safe to call multiple times. + """ + if ai.is_initialized(): + ai.free_resources() + print("AI resources have been released.") + else: + print("No AI resources were allocated.") +``` + +## Mettiamo Tutto Insieme – Lo Script Completo + +Di seguito trovi un unico file che unisce i quattro passaggi in un flusso di lavoro coerente. Salvalo come `batch_ocr.py` ed eseguilo da riga di comando. + +```python +# batch_ocr.py +""" +Python OCR tutorial – end‑to‑end batch OCR processing. +Loads PNG images, runs OCR, post‑processes results, and frees AI resources. 
+""" + +import sys +from pathlib import Path +import ai +import aocr + +# ---------------------------------------------------------------------- +# Helper functions (copied from the steps above) +# ---------------------------------------------------------------------- +def init_engine(cfg: str = "config.yaml"): + if not ai.is_initialized(): + ai.initialize(cfg) + else: + print("Engine already initialized.") + +def collect_png_paths(directory: str): + base_path = Path(directory) + if not base_path.is_dir(): + raise NotADirectoryError(f"'{directory}' is not a valid folder.") + png_files = sorted(base_path.glob("*.png")) + if not png_files: + raise FileNotFoundError("No PNG images found in the specified directory.") + print(f"Found {len(png_files)} PNG image(s) to process.") + return png_files + +def ocr_batch(image_paths): + results = [] + for image_path in image_paths: + img = aocr.Image.load(str(image_path)) + raw_text = engine.recognize(img) + cleaned_text = ai.run_postprocessor(raw_text) + results.append(cleaned_text) + print(f"Processed {image_path.name}: {len(cleaned_text)} characters.") + return results + +def release_resources(): + if ai.is_initialized(): + ai.free_resources() + print("AI resources released.") + else: + print("No resources to release.") + +# ---------------------------------------------------------------------- +# Main execution block +# ---------------------------------------------------------------------- +if __name__ == "__main__": + if len(sys.argv) != 2: + print("Usage: python batch_ocr.py ") + sys.exit(1) + + image_dir = sys.argv[1] + + try: + init_engine() + png_paths = collect_png_paths(image_dir) + texts = ocr_batch(png_paths) + + # Optional: write results to a single text file + output_file = Path("ocr_results.txt") + with output_file.open("w", encoding="utf-8") as f: + for path, txt in zip(png_paths, texts): + f.write(f"--- {path.name} ---\n") + f.write(txt + "\n\n") + print(f"All results saved to {output_file.resolve()}") + 
finally: + release_resources() +``` + +### Output Atteso + +Eseguendo lo script su una cartella contenente tre PNG potrebbe stampare: + +``` +Engine already initialized. +Found 3 PNG image(s) to process. +Processed invoice1.png: 452 characters. +Processed receipt2.png: 317 characters. +Processed flyer3.png: 689 characters. +All results saved to /home/user/ocr_results.txt +AI resources released. +``` + +Il file `ocr_results.txt` conterrà un delimitatore chiaro per ogni immagine seguito dal testo OCR pulito. + +## Stub Opzionali per aocr & ai (Se Non Hai Pacchetti Reali) + +Se vuoi solo testare il flusso senza importare librerie OCR pesanti, puoi creare moduli mock minimi: + +```python +# aocr/__init__.py +class Image: + @staticmethod + def load(path): + return f"ImageObject({path})" + +def dummy_recognize(self, image): + return "Raw OCR output for " + str(image) + +engine = type("Engine", (), {"recognize": dummy_recognize})() +``` + +```python +# ai/__init__.py +_state = {"initialized": False} + +def is_initialized(): + return _state["initialized"] + +def initialize(cfg): + print(f"Initializing AI engine with {cfg}") + _state["initialized"] = True + +def run_postprocessor(text): + # Very naive cleanup: strip extra spaces + return " ".join(text.split()) + +def free_resources(): + print("Freeing AI resources") + _state["initialized"] = False +``` + +Posiziona queste cartelle accanto a `batch_ocr.py` e lo script verrà eseguito, stampando risultati mock. + +## Consigli Pro & Trappole Comuni + +- **Picchi di memoria:** Se elabori migliaia di PNG ad alta risoluzione, considera di ridimensionarli prima dell'OCR. `aocr.Image.load` accetta spesso un argomento `max_size`. +- **Gestione Unicode:** Apri sempre il file di output con `encoding="utf-8"`; i motori OCR possono emettere caratteri non‑ASCII. +- **Parallelismo:** Per OCR CPU‑bound puoi avvolgere `ocr_batch` in un `concurrent.futures.ThreadPoolExecutor`. 
Ricorda però di mantenere una singola istanza `ai` – avviare molti thread che chiamano tutti `ai.initialize` vanifica l’obiettivo del “free AI resources”. +- **Resilienza agli errori:** Avvolgi il ciclo per immagine in un blocco `try/except` così un singolo PNG corrotto non interromperà l’intero batch. + +## Conclusione + +Ora hai un **python ocr tutorial** che dimostra come **load png image** file, eseguire **batch OCR processing** e gestire responsabilmente **free AI resources**. L’esempio completo e funzionante mostra esattamente come **recognize text from image** oggetti e pulire le risorse a fine operazione, così puoi copiarlo e incollarlo nei tuoi progetti senza cercare pezzi mancanti. + +Pronto per il passo successivo? Prova a sostituire i moduli stub `aocr` e `ai` con librerie reali come `pytesseract` e `torchvision`. Puoi anche estendere lo script per generare JSON, inviare i risultati a un database o integrarlo con un bucket di storage cloud. Il cielo è il limite—buon coding! + +{{< /blocks/products/pf/tutorial-page-section >}} +{{< /blocks/products/pf/main-container >}} +{{< /blocks/products/pf/main-wrap-class >}} +{{< blocks/products/products-backtop-button >}} \ No newline at end of file diff --git a/ocr/italian/python/general/run-ocr-on-image-complete-guide-to-structured-text-extractio/_index.md b/ocr/italian/python/general/run-ocr-on-image-complete-guide-to-structured-text-extractio/_index.md new file mode 100644 index 000000000..cd0708749 --- /dev/null +++ b/ocr/italian/python/general/run-ocr-on-image-complete-guide-to-structured-text-extractio/_index.md @@ -0,0 +1,255 @@ +--- +category: general +date: 2026-05-03 +description: Scopri come eseguire l'OCR su un'immagine ed estrarre il testo con le + coordinate utilizzando il riconoscimento OCR strutturato. Codice Python passo‑passo + incluso. 
+draft: false +keywords: +- run OCR on image +- extract text with coordinates +- structured OCR recognition +- OCR post‑processing +- bounding box extraction +- image text detection +language: it +og_description: Esegui OCR su un'immagine e ottieni il testo con le coordinate usando + il riconoscimento OCR strutturato. Esempio completo in Python con spiegazioni. +og_title: Esegui OCR sull'immagine – Tutorial di estrazione di testo strutturato +tags: +- OCR +- Python +- Computer Vision +title: Esegui OCR su immagine – Guida completa all’estrazione di testo strutturato +url: /it/python/general/run-ocr-on-image-complete-guide-to-structured-text-extractio/ +--- + +{{< blocks/products/pf/main-wrap-class >}} +{{< blocks/products/pf/main-container >}} +{{< blocks/products/pf/tutorial-page-section >}} + +# Esegui OCR su immagine – Guida completa all'estrazione di testo strutturato + +Ti è mai capitato di dover **run OCR on image** ma non sapevi come mantenere le posizioni esatte di ogni parola? Non sei l'unico. In molti progetti—scansione di ricevute, digitalizzazione di moduli o test UI—hai bisogno non solo del testo grezzo ma anche dei riquadri che indicano dove si trova ogni riga nell'immagine. + +Questo tutorial ti mostra un modo pratico per *run OCR on image* usando il motore **aocr**, richiedere il **structured OCR recognition** e poi post‑processare il risultato preservando la geometria. Alla fine sarai in grado di **extract text with coordinates** in poche righe di Python e comprenderai perché la modalità strutturata è importante per i task successivi. + +## Cosa imparerai + +- Come inizializzare il motore OCR per **structured OCR recognition**. +- Come fornire un'immagine e ricevere risultati grezzi che includono i limiti delle linee. +- Come eseguire un post‑processore che pulisce il testo senza perdere la geometria. +- Come iterare sulle linee finali e stampare ogni pezzo di testo insieme al suo bounding box. 
+ +Nessuna magia, nessun passaggio nascosto—solo un esempio completo e funzionante che puoi inserire nel tuo progetto. + +--- + +## Prerequisiti + +Prima di immergerci, assicurati di avere installato quanto segue: + +```bash +pip install aocr ai # hypothetical packages; replace with real ones if needed +``` + +Avrai inoltre bisogno di un file immagine (`input_image.png` o `.jpg`) che contenga testo chiaro e leggibile. Qualsiasi cosa, da una fattura scannerizzata a uno screenshot, va bene, purché il motore OCR riesca a vedere i caratteri. + +--- + +## Passo 1: Inizializzare il motore OCR per il riconoscimento strutturato + +La prima cosa che facciamo è creare un'istanza di `aocr.Engine()` e indicare che vogliamo **structured OCR recognition**. La modalità strutturata restituisce non solo il testo semplice ma anche dati geometrici (rettangoli di delimitazione) per ogni riga, essenziali quando devi mappare il testo sull'immagine. + +```python +import aocr +import ai # hypothetical post‑processing module + +# Initialise the OCR engine +ocr_engine = aocr.Engine() + +# Request structured recognition (text + geometry) +ocr_engine.recognize_mode = aocr.RecognitionMode.Structured +``` + +> **Perché è importante:** +> Nella modalità predefinita il motore potrebbe darti solo una stringa di parole concatenate. La modalità strutturata ti fornisce una gerarchia di pagine → linee → parole, ciascuna con coordinate, rendendo molto più semplice sovrapporre i risultati sull'immagine originale o passarli a un modello sensibile al layout. + +--- + +## Passo 2: Eseguire OCR sull'immagine e ottenere i risultati grezzi + +Ora forniamo l'immagine al motore. La chiamata `recognize` restituisce un oggetto `OcrResult` che contiene una collezione di linee, ognuna con il proprio rettangolo di delimitazione. 
+ +```python +# Load your image (any format supported by aocr) +input_image_path = "input_image.png" + +# Run OCR – this returns an OcrResult with lines and bounds +raw_result = ocr_engine.recognize(input_image_path) +``` + +A questo punto `raw_result.lines` contiene oggetti con due attributi importanti: + +- `text` – la stringa riconosciuta per quella linea. +- `bounds` – una tupla del tipo `(x, y, width, height)` che descrive la posizione della linea. + +--- + +## Passo 3: Post‑processare preservando la geometria + +L'output OCR grezzo è spesso rumoroso: caratteri erranti, spazi fuori posto o problemi di interruzione di riga. La funzione `ai.run_postprocessor` pulisce il testo ma **mantiene intatta la geometria originale**, così hai ancora coordinate accurate. + +```python +# Apply a post‑processing step that corrects common OCR errors +postprocessed_result = ai.run_postprocessor(raw_result) + +# The structure (lines + bounds) stays the same, only `line.text` changes +``` + +> **Consiglio esperto:** Se disponi di vocabolari specifici del dominio (ad es., codici prodotto), fornisci un dizionario personalizzato al post‑processore per migliorare la precisione. + +--- + +## Passo 4: Estrarre testo con coordinate – iterare e visualizzare + +Infine, cicliamo sulle linee pulite, stampando il bounding box di ciascuna insieme al suo testo. Questo è il cuore di **extract text with coordinates**. + +```python +# Print each recognised line together with its bounding box +for line in postprocessed_result.lines: + print(f"[{line.bounds}] {line.text}") +``` + +### Output previsto + +Supponendo che l'immagine di input contenga due linee: “Invoice #12345” e “Total: $89.99”, vedrai qualcosa del genere: + +``` +[(15, 30, 210, 25)] Invoice #12345 +[(15, 70, 190, 25)] Total: $89.99 +``` + +La prima tupla è il `(x, y, width, height)` della linea sull'immagine originale, permettendoti di disegnare rettangoli, evidenziare il testo o passare le coordinate a un altro sistema. 
+ +--- + +## Visualizzare il risultato (opzionale) + +Se vuoi vedere i bounding box sovrapposti all'immagine, puoi usare Pillow (PIL) per disegnare i rettangoli. Di seguito trovi un breve snippet; sentiti libero di saltarlo se ti servono solo i dati grezzi. + +```python +from PIL import Image, ImageDraw + +# Open the original image +img = Image.open(input_image_path) +draw = ImageDraw.Draw(img) + +# Draw a rectangle around each line +for line in postprocessed_result.lines: + x, y, w, h = line.bounds + draw.rectangle([x, y, x + w, y + h], outline="red", width=2) + +# Save or show the annotated image +img.save("annotated_output.png") +img.show() +``` + +![run OCR on image example showing bounding boxes](/images/ocr-bounding-boxes.png "run OCR on image – bounding box overlay") + +Il testo alternativo sopra contiene la **primary keyword**, soddisfacendo il requisito SEO per gli attributi alt delle immagini. + +--- + +## Perché il riconoscimento OCR strutturato supera la semplice estrazione di testo + +Ti potresti chiedere: “Non posso semplicemente eseguire OCR e ottenere il testo? Perché preoccuparsi della geometria?” + +- **Contesto spaziale:** Quando devi mappare i campi su un modulo (ad es., “Date” accanto al valore della data), le coordinate ti dicono *dove* si trova il dato. +- **Layout a più colonne:** Il testo lineare semplice perde l'ordine; i dati strutturati preservano l'ordine delle colonne. +- **Precisione del post‑processing:** Conoscere le dimensioni del box ti aiuta a decidere se una parola è un'intestazione, una nota a piè di pagina o un artefatto errante. + +In sintesi, **structured OCR recognition** ti offre la flessibilità per costruire pipeline più intelligenti—che tu stia inserendo dati in un database, creando PDF ricercabili o addestrando un modello di machine‑learning che rispetti il layout. 
+ +--- + +## Casi limite comuni e come gestirli + +| Situazione | Cosa controllare | Correzione suggerita | +|-----------|-------------------|---------------| +| **Immagini ruotate o distorte** | I bounding box potrebbero essere fuori asse. | Pre‑processare con deskewing (ad es., `warpAffine` di OpenCV). | +| **Font molto piccoli** | Il motore può perdere caratteri, generando linee vuote. | Aumentare la risoluzione dell'immagine o usare `ocr_engine.set_dpi(300)`. | +| **Lingue miste** | Un modello linguistico errato può produrre testo incomprensibile. | Impostare `ocr_engine.language = ["en", "de"]` prima del riconoscimento. | +| **Box sovrapposti** | Il post‑processore potrebbe unire due linee involontariamente. | Verificare `line.bounds` dopo il processing; regolare le soglie in `ai.run_postprocessor`. | + +Affrontare questi scenari fin dall'inizio ti farà risparmiare mal di testa in seguito, soprattutto quando scala la soluzione a centinaia di documenti al giorno. + +--- + +## Script completo end‑to‑end + +Di seguito trovi il programma completo, pronto per l'esecuzione, che collega tutti i passaggi. Copia‑incolla, aggiusta il percorso dell'immagine e sei pronto. + +```python +# -*- coding: utf-8 -*- +""" +Run OCR on image – extract text with coordinates using structured OCR recognition. 
+Author: Your Name +Date: 2026-05-03 +""" + +import aocr +import ai +from PIL import Image, ImageDraw + +def run_structured_ocr(image_path: str, annotate: bool = False): + # 1️⃣ Initialise the OCR engine + ocr_engine = aocr.Engine() + ocr_engine.recognize_mode = aocr.RecognitionMode.Structured + + # 2️⃣ Recognise the image + raw_result = ocr_engine.recognize(image_path) + + # 3️⃣ Post‑process while keeping geometry + processed = ai.run_postprocessor(raw_result) + + # 4️⃣ Print each line with its bounding box + for line in processed.lines: + print(f"[{line.bounds}] {line.text}") + + # Optional visualisation + if annotate: + img = Image.open(image_path) + draw = ImageDraw.Draw(img) + for line in processed.lines: + x, y, w, h = line.bounds + draw.rectangle([x, y, x + w, y + h], outline="red", width=2) + annotated_path = "annotated_" + image_path + img.save(annotated_path) + print(f"Annotated image saved as {annotated_path}") + +if __name__ == "__main__": + INPUT_IMG = "input_image.png" + run_structured_ocr(INPUT_IMG, annotate=True) +``` + +Eseguire questo script: + +1. **Run OCR on image** in modalità strutturata. +2. **Extract text with coordinates** per ogni linea. +3. Opzionalmente produce un PNG annotato che mostra i box. + +--- + +## Conclusione + +Ora disponi di una soluzione solida e autonoma per **run OCR on image** e **extract text with coordinates** usando **structured OCR recognition**. Il codice dimostra ogni passaggio—dall'inizializzazione del motore al post‑processing e alla verifica visiva—così puoi adattarlo a ricevute, moduli o qualsiasi documento visivo che richieda una localizzazione precisa del testo. + +Qual è il prossimo passo? Prova a sostituire il motore `aocr` con un'altra libreria (Tesseract, EasyOCR) e osserva come differiscono le uscite strutturate. Sperimenta diverse strategie di post‑processing, come il controllo ortografico o filtri regex personalizzati, per aumentare la precisione nel tuo dominio. 
E se costruisci una pipeline più ampia, considera di memorizzare le coppie `(text, bounds)` in un database per analisi future. + +Buon coding, e che i tuoi progetti OCR siano sempre precisi! + +{{< /blocks/products/pf/tutorial-page-section >}} +{{< /blocks/products/pf/main-container >}} +{{< /blocks/products/pf/main-wrap-class >}} +{{< blocks/products/products-backtop-button >}} \ No newline at end of file diff --git a/ocr/japanese/python/general/extract-text-from-image-ocr-with-aspose-ai-spell-check/_index.md b/ocr/japanese/python/general/extract-text-from-image-ocr-with-aspose-ai-spell-check/_index.md new file mode 100644 index 000000000..2c423fd8f --- /dev/null +++ b/ocr/japanese/python/general/extract-text-from-image-ocr-with-aspose-ai-spell-check/_index.md @@ -0,0 +1,228 @@ +--- +category: general +date: 2026-05-03 +description: Aspose OCR と AI スペルチェックを使用して画像からテキストを抽出します。画像の OCR 方法、OCR 用の画像の読み込み、請求書からのテキスト認識、GPU + リソースの解放方法を学びましょう。 +draft: false +keywords: +- extract text from image +- how to ocr image +- load image for ocr +- release gpu resources +- recognize text from invoice +language: ja +og_description: Aspose OCR と AI スペルチェックで画像からテキストを抽出する。OCR 画像の方法、OCR 用の画像の読み込み、GPU + リソースの解放についてステップバイステップで解説。 +og_title: 画像からテキストを抽出 – 完全OCR&スペルチェックガイド +tags: +- OCR +- Aspose +- AI +- Python +title: 画像からテキストを抽出 – Aspose AI スペルチェックによる OCR +url: /ja/python/general/extract-text-from-image-ocr-with-aspose-ai-spell-check/ +--- + +{{< blocks/products/pf/main-wrap-class >}} +{{< blocks/products/pf/main-container >}} +{{< blocks/products/pf/tutorial-page-section >}} + +# 画像からテキストを抽出 – 完全OCR&スペルチェックガイド + +**画像からテキストを抽出**したいことはありますか、しかしどのライブラリが速度と精度の両方を提供するか分からなかったことはありませんか? 
あなただけではありません。実際のプロジェクトでは、請求書処理、レシートのデジタル化、契約書のスキャンなど、画像からクリーンで検索可能なテキストを取得することが最初のハードルです。 + +良いニュースは、Aspose OCR と軽量な Aspose AI モデルを組み合わせることで、数行の Python でその作業を処理できることです。このチュートリアルでは **画像を OCR する方法** を順に説明し、画像を正しく読み込み、組み込みのスペルチェックポストプロセッサを実行し、最後に **GPU リソースを解放** してアプリのメモリ使用を抑える方法を紹介します。 + +このガイドの最後までに、**請求書からテキストを認識**できるようになり、一般的な OCR の誤りを自動的に修正し、次のバッチのために GPU をクリーンに保つことができます。 + +--- + +## 必要なもの + +- Python 3.9 以上(コードは型ヒントを使用していますが、以前の 3.x バージョンでも動作します) +- `aspose-ocr` と `aspose-ai` パッケージ(`pip install aspose-ocr aspose-ai` でインストール) +- CUDA 対応 GPU はオプションです。GPU が見つからない場合はスクリプトが CPU にフォールバックします。 +- 例として `sample_invoice.png` のような画像を、参照できるフォルダーに配置します。 + +重い機械学習フレームワークや大容量モデルのダウンロードは不要です—ほとんどの GPU に快適に収まる小さな Q4‑K‑M 量子化モデルだけです。 + +--- + +## ステップ 1: OCR エンジンの初期化 – 画像からテキストを抽出 + +最初に行うのは `OcrEngine` インスタンスを作成し、期待する言語を指定することです。ここでは英語を選択し、プレーンテキスト出力を要求します。これは下流の処理に最適です。 + +```python +import aocr # Aspose OCR package +import aspose.ai as ai # Aspose AI package + +# Initialise the OCR engine +ocr_engine = aocr.OcrEngine() +ocr_engine.language = aocr.Language.English # Choose any supported language +ocr_engine.recognize_mode = aocr.RecognitionMode.Plain # Plain text makes post‑processing easier +``` + +**この設定が重要な理由:** 言語を設定することで文字セットが絞られ、精度が向上します。プレーンテキストモードは、画像からテキストを抽出したいだけのときに通常不要なレイアウト情報を除去します。 + +--- + +## ステップ 2: OCR 用に画像をロード – 画像を OCR する方法 + +ここでエンジンに実際の画像を渡します。`Image.load` ヘルパーは一般的なフォーマット(PNG、JPEG、TIFF)を理解し、ファイル I/O の細かい違いを抽象化します。 + +```python +# Load the input image – this is the "load image for OCR" step +input_image = aocr.Image.load("YOUR_DIRECTORY/sample_invoice.png") +raw_text = ocr_engine.recognize(input_image) # Returns the recognised text as a string +``` + +**ヒント:** ソース画像が大きい場合は、エンジンに送る前にリサイズすることを検討してください。サイズを小さくすると、認識品質を損なうことなく GPU メモリ使用量を削減できます。 + +--- + +## ステップ 3: Aspose AI モデルの構成 – 請求書からテキストを認識 + +Aspose AI には自動ダウンロード可能な小さな GGUF モデルが同梱されています。例では `Qwen2.5‑3B‑Instruct‑GGUF` リポジトリを使用し、`q4_k_m` に量子化しています。また、ランタイムに GPU 上で 20 層を割り当てるよう指示し、速度と VRAM 使用量のバランスを取ります。 + 
+```python +# Model configuration – auto‑download a small Q4‑K‑M quantised model +model_config = ai.AsposeAIModelConfig() +model_config.allow_auto_download = "true" +model_config.hugging_face_repo_id = "bartowski/Qwen2.5-3B-Instruct-GGUF" +model_config.hugging_face_quantization = "q4_k_m" +model_config.gpu_layers = 20 # Use 20 GPU layers when a GPU is available +``` + +**内部的には:** 量子化モデルはディスク上で約 1.5 GB で、フルプレシジョンモデルのごく一部です。それでも、典型的な OCR の綴りミスを検出できるほどの言語的ニュアンスを保持しています。 + +--- + +## ステップ 4: AsposeAI の初期化とスペルチェックポストプロセッサの添付 + +Aspose AI には既製のスペルチェックポストプロセッサが含まれています。これを添付することで、すべての OCR 結果が自動的にクリーンアップされます。 + +```python +# Initialise AsposeAI and attach the built‑in spell‑check post‑processor +ocr_ai = ai.AsposeAI(model_config) # Pass the config we just built +ocr_ai.set_post_processor(ocr_ai.postprocessor_spell_check, {}) # Empty dict → default settings +``` + +**ポストプロセッサを使用する理由:** OCR エンジンはしばしば “Invoice” を “Invo1ce”、 “Total” を “T0tal” と誤認識します。スペルチェックは軽量言語モデルを生文字列に適用し、カスタム辞書を作成せずにこれらのエラーを修正します。 + +--- + +## ステップ 5: OCR 結果にスペルチェックポストプロセッサを実行 + +すべてが接続された状態で、1 回の呼び出しで修正されたテキストが得られます。また、元のテキストとクリーンアップ後のテキストの両方を出力し、改善を確認できるようにします。 + +```python +# Run the spell‑check post‑processor on the OCR result +corrected_text = ocr_ai.run_postprocessor(raw_text) + +print("Original :", raw_text) +print("Corrected:", corrected_text) +``` + +請求書の典型的な出力は次のようになります: + +``` +Original : Invo1ce #12345 +Date: 2023/07/15 +Total: $1,250.00 +... +Corrected: Invoice #12345 +Date: 2023/07/15 +Total: $1,250.00 +... 
+``` + +“Invo1ce” が正しい単語 “Invoice” に変換されていることに注目してください。これが組み込み AI スペルチェックの力です。 + +--- + +## ステップ 6: GPU リソースの解放 – GPU リソースを安全に解放 + +長時間稼働するサービス(例: 1 分で数十件の請求書を処理する Web API)で実行する場合、各バッチ後に GPU コンテキストを解放する必要があります。そうしないとメモリリークが発生し、最終的に “CUDA out of memory” エラーになります。 + +```python +# Release GPU resources – crucial to avoid memory leaks +ocr_ai.free_resources() +``` + +**プロのコツ:** 例外が発生した場合でも必ず実行されるよう、`finally` ブロックまたはコンテキストマネージャ内で `free_resources()` を呼び出してください。 + +--- + +## 完全な動作例 + +すべての要素を組み合わせると、任意のプロジェクトに組み込める自己完結型スクリプトが得られます。 + +```python +# extract_text_from_image.py +import aocr +import aspose.ai as ai + +def main(): + # Step 1: Initialise OCR engine + ocr_engine = aocr.OcrEngine() + ocr_engine.language = aocr.Language.English + ocr_engine.recognize_mode = aocr.RecognitionMode.Plain + + # Step 2: Load image for OCR + input_image = aocr.Image.load("YOUR_DIRECTORY/sample_invoice.png") + raw_text = ocr_engine.recognize(input_image) + + # Step 3: Configure Aspose AI model + model_cfg = ai.AsposeAIModelConfig() + model_cfg.allow_auto_download = "true" + model_cfg.hugging_face_repo_id = "bartowski/Qwen2.5-3B-Instruct-GGUF" + model_cfg.hugging_face_quantization = "q4_k_m" + model_cfg.gpu_layers = 20 + + # Step 4: Initialise AI and attach spell‑check + ocr_ai = ai.AsposeAI(model_cfg) + ocr_ai.set_post_processor(ocr_ai.postprocessor_spell_check, {}) + + # Step 5: Run spell‑check + corrected_text = ocr_ai.run_postprocessor(raw_text) + + print("Original :", raw_text) + print("Corrected:", corrected_text) + + # Step 6: Release GPU resources + ocr_ai.free_resources() + +if __name__ == "__main__": + main() +``` + +ファイルを保存し、画像へのパスを調整して `python extract_text_from_image.py` を実行してください。コンソールにクリーンアップされた請求書テキストが表示されるはずです。 + +--- + +## よくある質問 (FAQ) + +**Q: CPU のみのマシンでも動作しますか?** +A: もちろんです。GPU が検出されない場合、Aspose AI は CPU 実行にフォールバックしますが、速度は遅くなります。`model_cfg.gpu_layers = 0` を設定すれば CPU を強制できます。 + +**Q: 請求書が英語以外の言語の場合はどうすればよいですか?** +A: `ocr_engine.language` を適切な enum 値に変更してください(例: 
`aocr.Language.Spanish`)。スペルチェックモデルは多言語対応ですが、言語固有のモデルを使用した方がより良い結果が得られることがあります。 + +**Q: 複数の画像をループで処理できますか?** +A: はい。ロード、認識、ポストプロセッシングのステップを `for` ループ内に移動すれば可能です。同じ AI インスタンスを再利用する場合は、ループ後または各バッチ後に `ocr_ai.free_resources()` を呼び出すことを忘れないでください。 + +**Q: モデルのダウンロードサイズはどれくらいですか?** +A: 量子化された `q4_k_m` バージョンで約 1.5 GB です。最初の実行後にキャッシュされるため、以降の実行は瞬時です。 + +--- + +## 結論 + +このチュートリアルでは Aspose OCR を使用して **画像からテキストを抽出** する方法、小さな AI モデルの構成、スペルチェックポストプロセッサの適用、そして安全に **GPU リソースを解放** する方法を示しました。このワークフローは画像のロードから後始末までを網羅し、**請求書からテキストを認識**するシナリオに信頼できるパイプラインを提供します。 + +次のステップは? スペルチェックをカスタムエンティティ抽出モデルに置き換えてみてください + +{{< /blocks/products/pf/tutorial-page-section >}} +{{< /blocks/products/pf/main-container >}} +{{< /blocks/products/pf/main-wrap-class >}} +{{< blocks/products/products-backtop-button >}} \ No newline at end of file diff --git a/ocr/japanese/python/general/how-to-batch-ocr-with-aspose-ocr-full-python-guide/_index.md b/ocr/japanese/python/general/how-to-batch-ocr-with-aspose-ocr-full-python-guide/_index.md new file mode 100644 index 000000000..92afa97da --- /dev/null +++ b/ocr/japanese/python/general/how-to-batch-ocr-with-aspose-ocr-full-python-guide/_index.md @@ -0,0 +1,213 @@ +--- +category: general +date: 2026-05-03 +description: Aspose OCR と AI スペルチェックを使用して画像をバッチ OCR する方法。画像からテキストを抽出し、スペルチェックを適用し、AI + リソースを無料で利用して OCR エラーを修正する方法を学びましょう。 +draft: false +keywords: +- how to batch ocr +- extract text from images +- free ai resources +- apply spell check +- correct ocr errors +language: ja +og_description: Aspose OCR と AI スペルチェックを使用して画像をバッチ OCR する方法。画像からテキストを抽出し、スペルチェックを適用し、AI + リソースを無料で利用し、OCR エラーを修正するステップバイステップのガイドをご覧ください。 +og_title: Aspose OCRでバッチOCRを行う方法 – 完全なPythonチュートリアル +tags: +- OCR +- Python +- AI +- Aspose +title: Aspose OCRでバッチOCRを行う方法 – 完全Pythonガイド +url: /ja/python/general/how-to-batch-ocr-with-aspose-ocr-full-python-guide/ +--- + +{{< blocks/products/pf/main-wrap-class >}} +{{< blocks/products/pf/main-container >}} +{{< blocks/products/pf/tutorial-page-section >}} + 
+# Aspose OCRでバッチOCRを行う方法 – 完全Pythonガイド + +スキャンしたPDFや写真のフォルダ全体を、ファイルごとに別々のスクリプトを書かずに **how to batch OCR** できるか、考えたことはありませんか? あなただけではありません。実際のパイプラインでは **画像からテキストを抽出** し、スペルミスを修正し、最後に割り当てたAIリソースを解放する必要があります。このチュートリアルでは、Aspose OCR(軽量AIポストプロセッサ)と数行のPythonでそれを実現する方法を正確に示します。 + +OCRエンジンの初期化、AIスペルチェッカーの接続、画像ディレクトリのループ処理、そしてモデルのクリーンアップまで順を追って説明します。最後まで実行すれば、**OCRエラーを自動的に修正** し、**AIリソースを解放** してGPUを快適に保つ、すぐに使えるスクリプトが手に入ります。 + +## 必要なもの + +- Python 3.9+(コードは型ヒントを使用していますが、以前の3.xバージョンでも動作します) +- `asposeocr` パッケージ(`pip install asposeocr`) – OCRエンジンを提供します。 +- Hugging Face モデル `bartowski/Qwen2.5-3B-Instruct-GGUF` へのアクセス(自動的にダウンロードされます)。 +- 少なくとも数GBのVRAMを持つGPU(スクリプトは `gpu_layers = 30` を設定していますが、必要に応じて下げられます)。 + +外部サービスや有料APIは不要ですべてローカルで実行できます。 + +--- + +## ステップ1: OCRエンジンの設定 – **how to batch OCR** を効率的に + +大量の画像を処理する前に、堅牢なOCRエンジンが必要です。Aspose OCR は、1回の呼び出しで言語と認識モードを選択できます。 + +```python +# Step 1: Initialize the OCR engine for English plain‑text output +def init_ocr() -> aocr.OcrEngine: + ocr_engine = aocr.OcrEngine() + ocr_engine.language = aocr.Language.English # English language pack + ocr_engine.recognize_mode = aocr.RecognitionMode.Plain # Returns raw string, no layout + return ocr_engine +``` + +**Why this matters:** `recognize_mode` を `Plain` に設定すると出力が軽量になり、後でスペルチェックを実行する際に最適です。レイアウト情報が必要な場合は `Layout` に切り替えますが、バッチジョブでは余計なオーバーヘッドになる可能性があります。 + +> **Pro tip:** 多言語スキャンを扱う場合は、`ocr_engine.language = [aocr.Language.English, aocr.Language.Spanish]` のようにリストで指定できます。 + +--- + +## ステップ2: AIポストプロセッサの初期化 – OCR出力に **Apply Spell Check** を適用 + +Aspose AI には、任意のモデルを実行できる組み込みのポストプロセッサが付属しています。ここでは、Hugging Face から量子化された Qwen 2.5 モデルを取得し、スペルチェックルーチンにフックします。 + +```python +# Step 2: Configure and start the AI post‑processor +def init_ai() -> aocr.ai.AsposeAI: + model_cfg = AsposeAIModelConfig() + model_cfg.allow_auto_download = "true" + model_cfg.hugging_face_repo_id = "bartowski/Qwen2.5-3B-Instruct-GGUF" + model_cfg.hugging_face_quantization = "q4_k_m" + model_cfg.gpu_layers = 30 # Adjust based on 
your GPU memory + ai_processor = AsposeAI() + ai_processor.initialize(model_cfg) + + # Attach the built‑in spell‑check post‑processor + ai_processor.set_post_processor(ai_processor.postprocessor_spell_check, {}) + return ai_processor +``` + +**Why this matters:** モデルは量子化(`q4_k_m`)されており、メモリ使用量を大幅に削減しつつ十分な言語理解を提供します。`set_post_processor` を呼び出すことで、任意の文字列に対して **apply spell check** ステップを自動的に実行するよう Aspose AI に指示します。 + +> **Watch out:** GPU が 30 層に対応できない場合は、数値を 15 や 5 に下げても動作しますが、処理はやや遅くなります。 + +--- + +## ステップ3: OCRを実行し、単一画像で **Correct OCR Errors** を行う + +OCRエンジンとAIスペルチェッカーの両方が準備できたので、これらを組み合わせます。この関数は画像を読み込み、生のテキストを抽出し、AIポストプロセッサでクリーンアップします。 + +```python +# Step 3: OCR an image and run the spell‑check post‑processor +def ocr_and_correct(image_path: str, + ocr_engine: aocr.OcrEngine, + ai_processor: aocr.ai.AsposeAI) -> str: + image = aocr.Image.load(image_path) # Load any supported format + raw_text = ocr_engine.recognize(image) # Plain string from OCR + corrected_text = ai_processor.run_postprocessor(raw_text) + return corrected_text +``` + +**Why this matters:** 生のOCR文字列を直接AIモデルに渡すことで、正規表現やカスタム辞書を書かずに **correct OCR errors** のパスが実現できます。モデルは文脈を理解しているため、`recieve` → `receive` のような誤りや、より微妙なミスも修正できます。 + +--- + +## ステップ4: **画像からテキストを抽出** を一括で – 実際のバッチループ + +ここで **how to batch OCR** の真価が発揮されます。ディレクトリを走査し、サポート外のファイルをスキップし、各修正済み出力を `.txt` ファイルに書き込みます。 + +```python +# Step 4: Process an entire folder of images +if __name__ == "__main__": + # Initialize once – reuse for every file + ocr_engine = init_ocr() + ai_processor = init_ai() + + input_dir = "YOUR_DIRECTORY/input_images" + output_dir = "YOUR_DIRECTORY/output_text" + os.makedirs(output_dir, exist_ok=True) + + for file_name in os.listdir(input_dir): + # Only handle common image extensions + if not file_name.lower().endswith(('.png', '.jpg', '.jpeg', '.tif', '.tiff')): + continue + + image_path = os.path.join(input_dir, file_name) + corrected = ocr_and_correct(image_path, ocr_engine, ai_processor) + + txt_path = 
os.path.join(output_dir, + os.path.splitext(file_name)[0] + ".txt") + with open(txt_path, "w", encoding="utf-8") as txt_file: + txt_file.write(corrected) + + print(f"Processed {file_name}") + + # Step 5: Release **free AI resources** after the batch finishes + ai_processor.free_resources() +``` + +### 期待される出力 + +画像に文 *“The quick brown fox jumps over the lazzy dog.”* が含まれている場合、次のようなテキストファイルが生成されます。 + +``` +The quick brown fox jumps over the lazy dog. +``` + +二重の “z” が自動的に修正されていることに注目してください – これがAIスペルチェックの効果です。 + +**Why this matters:** OCR と AI オブジェクトを **一度だけ** 作成して再利用することで、各ファイルごとにモデルをロードするオーバーヘッドを回避できます。これがスケールで **how to batch OCR** を行う最も効率的な方法です。 + +--- + +## ステップ5: クリーンアップ – **Free AI Resources** を正しく行う + +作業が完了したら `free_resources()` を呼び出して GPU メモリ、CUDA コンテキスト、モデルが作成した一時ファイルを解放します。 + +```python +# Step 5: Explicitly free GPU and model memory +ai_processor.free_resources() +``` + +このステップを省略すると、GPU の割り当てが残り続け、後続の Python プロセスがクラッシュしたり VRAM を食い尽くしたりする可能性があります。バッチジョブの「電源オフ」部分と考えてください。 + +--- + +## よくある落とし穴と追加ヒント + +| 問題 | 確認すべき点 | 対策 | +|------|--------------|------| +| **Out‑of‑memory errors** | GPU が数十枚の画像処理後にメモリ不足になる | `gpu_layers` を減らすか、CPU に切り替える(`model_cfg.gpu_layers = 0`)。 | +| **Missing language pack** | OCR が空文字列を返す | `asposeocr` バージョンに英語言語データが含まれているか確認し、必要なら再インストール。 | +| **Non‑image files** | `.pdf` などの余計なファイルでスクリプトがクラッシュ | `if not file_name.lower().endswith(...)` ガードで既にスキップされています。 | +| **Spell‑check not applied** | 出力が生の OCR と同じになる | ループ前に `ai_processor.set_post_processor` が呼び出されたか確認。 | +| **Slow batch speed** | 1枚あたり 5 秒以上かかる | 初回実行後に `model_cfg.allow_auto_download = "false"` を有効にし、モデルの再ダウンロードを防止。 | + +**Pro tip:** 英語以外の言語で **画像からテキストを抽出** したい場合は、`ocr_engine.language` を該当する enum(例: `aocr.Language.French`)に変更してください。同じ AI ポストプロセッサは引き続きスペルチェックを適用しますが、ベストな結果を得るには言語固有のモデルを使用することをおすすめします。 + +--- + +## まとめと次のステップ + +**how to batch OCR** の全パイプラインをカバーしました: + +1. 英語向けのプレーンテキスト OCR エンジンを **初期化**。 +2. AI スペルチェックモデルを設定し、ポストプロセッサとしてバインド。 +3. 
各画像で OCR を実行し、AI が **OCRエラーを自動修正**。 +4. ディレクトリをループして **画像からテキストを一括抽出**。 +5. ジョブ完了後に **AIリソースを解放**。 + +ここからは次のような活用が考えられます: + +- 修正済みテキストを下流の NLP パイプライン(感情分析、エンティティ抽出など)に流す。 +- `ai_processor.set_post_processor(your_custom_func, {})` を呼び出すことで、スペルチェックの代わりにカスタム要約器に差し替える。 +- GPU が複数ストリームに対応できる場合は、`concurrent.futures.ThreadPoolExecutor` を使ってフォルダループを並列化する。 + +--- + +## 最後に + +バッチOCRは面倒な作業である必要はありません。Aspose OCR と軽量AIモデルを組み合わせることで、**画像からテキストを抽出**、**スペルチェックを適用**、**OCRエラーを修正**、そして **AIリソースをきれいに解放** するワンストップソリューションが手に入ります。テストフォルダでスクリプトを試し、ハードウェアに合わせて GPU レイヤー数を調整すれば、数分で本番レベルのパイプラインが完成します。 + +モデルの調整や PDF の取り扱い、Web サービスへの統合について質問がありますか? コメントを残すか、GitHub で ping してください。コーディングを楽しんで、正確な OCR を手に入れましょう! + +{{< /blocks/products/pf/tutorial-page-section >}} +{{< /blocks/products/pf/main-container >}} +{{< /blocks/products/pf/main-wrap-class >}} +{{< blocks/products/products-backtop-button >}} \ No newline at end of file diff --git a/ocr/japanese/python/general/python-ocr-tutorial-batch-ocr-processing-made-easy/_index.md b/ocr/japanese/python/general/python-ocr-tutorial-batch-ocr-processing-made-easy/_index.md new file mode 100644 index 000000000..86495782c --- /dev/null +++ b/ocr/japanese/python/general/python-ocr-tutorial-batch-ocr-processing-made-easy/_index.md @@ -0,0 +1,296 @@ +--- +category: general +date: 2026-05-03 +description: PNG画像ファイルの読み込み方法、画像からの文字認識、バッチOCR処理のための無料AIリソースを紹介するPython OCRチュートリアル +draft: false +keywords: +- python ocr tutorial +- batch ocr processing +- free ai resources +- load png image +- recognize text from image +language: ja +og_description: Python OCRチュートリアルでは、PNG画像の読み込み、画像からの文字認識、バッチOCR処理のための無料AIリソースの活用方法を順を追って解説します。 +og_title: Python OCRチュートリアル – 無料AIリソースで手軽にバッチOCR +tags: +- OCR +- Python +- AI +title: Python OCRチュートリアル – バッチOCR処理を簡単に +url: /ja/python/general/python-ocr-tutorial-batch-ocr-processing-made-easy/ +--- + +{{< blocks/products/pf/main-wrap-class >}} +{{< blocks/products/pf/main-container >}} +{{< blocks/products/pf/tutorial-page-section >}} + 
+# Python OCR チュートリアル – バッチ OCR 処理を簡単に + +実際に何十枚もの PNG ファイルに対して OCR を実行でき、頭を抱えることなく使える **python ocr tutorial** が欲しかったことはありませんか? あなたは一人ではありません。多くの実務プロジェクトでは **load png image** ファイルを読み込み、エンジンに渡し、作業が終わったら AI リソースをクリーンアップする必要があります。 + +このガイドでは、**recognize text from image** ファイルを正確に認識し、バッチ処理し、基盤となる AI メモリを解放する完全で実行可能な例を順に解説します。最後まで読むと、余計なものはなく、必要な要素だけが揃った、どのプロジェクトにも組み込める自己完結型スクリプトが手に入ります。 + +## 必要なもの + +- Python 3.10 以上(ここで使用している構文は f‑strings と型ヒントに依存しています) +- `engine.recognize` メソッドを提供する OCR ライブラリ – デモ用には架空の `aocr` パッケージを想定していますが、Tesseract や EasyOCR などに置き換えることができます +- `ai` ヘルパーモジュール(コードスニペットに示されているもの)。モデルの初期化とリソースのクリーンアップを処理します +- 処理したい PNG ファイルが入ったフォルダー + +`aocr` や `ai` がインストールされていない場合は、スタブで模倣できます – 終わり近くの “Optional Stubs” セクションをご覧ください。 + +## 手順 1: AI エンジンの初期化(AI リソースの解放) + +OCR パイプラインに画像を投入する前に、基盤となるモデルが準備できている必要があります。1 回だけ初期化することでメモリを節約し、バッチジョブの速度が向上します。 + +```python +# step_1_initialize.py +import ai # hypothetical helper that wraps the AI model +import aocr # OCR library + +def init_engine(config_path: str = "config.yaml"): + """ + Initialize the AI engine if it hasn't been set up yet. + This uses free AI resources – the engine will be released later. + """ + if not ai.is_initialized(): + ai.initialize(config_path) # auto‑initialize with the provided configuration + else: + print("Engine already initialized.") +``` + +**なぜ重要か:** +`ai.initialize` を画像ごとに繰り返し呼び出すと GPU メモリが何度も割り当てられ、最終的にスクリプトがクラッシュします。`ai.is_initialized()` を確認することで、1 回だけの割り当てを保証します – これが “free AI resources” の原則です。 + +## 手順 2: バッチ OCR 処理のために PNG 画像ファイルをロードする + +ここでは、OCR にかけたいすべての PNG ファイルを集めます。`pathlib` を使用することでコードが OS に依存しません。 + +```python +# step_2_load_images.py +from pathlib import Path +from typing import List + +def collect_png_paths(directory: str) -> List[Path]: + """ + Scan `directory` and return a list of Path objects pointing to PNG files. 
+ """ + base_path = Path(directory) + if not base_path.is_dir(): + raise NotADirectoryError(f"'{directory}' is not a valid folder.") + + png_files = sorted(base_path.glob("*.png")) + if not png_files: + raise FileNotFoundError("No PNG images found in the specified directory.") + + print(f"Found {len(png_files)} PNG image(s) to process.") + return png_files +``` + +**エッジケース:** +フォルダーに PNG 以外のファイル(例: JPEG)が含まれている場合は無視され、`engine.recognize` がサポートされていない形式でエラーになるのを防ぎます。 + +## 手順 3: 各画像で OCR を実行し、ポストプロセッシングを適用する + +エンジンが準備でき、ファイルリストが整ったら、画像をループし、生テキストを抽出し、一般的な OCR アーティファクト(余分な改行など)をクリーンアップするポストプロセッサに渡すことができます。 + +```python +# step_3_ocr_batch.py +import aocr +import ai +from pathlib import Path +from typing import List + +def ocr_batch(image_paths: List[Path]) -> List[str]: + """ + Perform OCR on each PNG image and return a list of cleaned strings. + """ + results = [] + for image_path in image_paths: + # Load the image – aocr.Image.load abstracts away Pillow/OpenCV details + img = aocr.Image.load(str(image_path)) + + # Recognize raw text + raw_text = engine.recognize(img) + + # Refine the raw OCR output using the AI post‑processor + cleaned_text = ai.run_postprocessor(raw_text) + results.append(cleaned_text) + + print(f"Processed {image_path.name}: {len(cleaned_text)} characters extracted.") + + return results +``` + +**ロードと認識を分離する理由:** +`aocr.Image.load` は遅延デコードを行う可能性があり、大規模バッチでは高速です。ロードステップを明示的に保つことで、後で JPEG や TIFF ファイルを扱う必要が出た場合に別の画像ライブラリに簡単に置き換えられます。 + +## 手順 4: バッチ終了後のクリーンアップ – AI リソースの解放 + +バッチが完了したら、メモリリークを防ぐためにモデルを解放する必要があります。特に GPU 対応マシンでは重要です。 + +```python +# step_4_cleanup.py +import ai + +def release_resources(): + """ + Free any allocated AI resources. Safe to call multiple times. 
+ """ + if ai.is_initialized(): + ai.free_resources() + print("AI resources have been released.") + else: + print("No AI resources were allocated.") +``` + +## 全体をまとめる – 完全なスクリプト + +以下は、4 つの手順を統合した単一ファイルです。`batch_ocr.py` として保存し、コマンドラインから実行してください。 + +```python +# batch_ocr.py +""" +Python OCR tutorial – end‑to‑end batch OCR processing. +Loads PNG images, runs OCR, post‑processes results, and frees AI resources. +""" + +import sys +from pathlib import Path +import ai +import aocr + +# ---------------------------------------------------------------------- +# Helper functions (copied from the steps above) +# ---------------------------------------------------------------------- +def init_engine(cfg: str = "config.yaml"): + if not ai.is_initialized(): + ai.initialize(cfg) + else: + print("Engine already initialized.") + +def collect_png_paths(directory: str): + base_path = Path(directory) + if not base_path.is_dir(): + raise NotADirectoryError(f"'{directory}' is not a valid folder.") + png_files = sorted(base_path.glob("*.png")) + if not png_files: + raise FileNotFoundError("No PNG images found in the specified directory.") + print(f"Found {len(png_files)} PNG image(s) to process.") + return png_files + +def ocr_batch(image_paths): + results = [] + for image_path in image_paths: + img = aocr.Image.load(str(image_path)) + raw_text = engine.recognize(img) + cleaned_text = ai.run_postprocessor(raw_text) + results.append(cleaned_text) + print(f"Processed {image_path.name}: {len(cleaned_text)} characters.") + return results + +def release_resources(): + if ai.is_initialized(): + ai.free_resources() + print("AI resources released.") + else: + print("No resources to release.") + +# ---------------------------------------------------------------------- +# Main execution block +# ---------------------------------------------------------------------- +if __name__ == "__main__": + if len(sys.argv) != 2: + print("Usage: python batch_ocr.py ") + sys.exit(1) + + image_dir = 
# aocr/__init__.py
class Image:
    """Minimal stand-in for the real aocr Image class."""

    @staticmethod
    def load(path):
        # Return a recognisable token instead of decoding a real image file.
        return f"ImageObject({path})"

def dummy_recognize(image):
    """Fake OCR: echo the image token back as the 'recognised' text."""
    return "Raw OCR output for " + str(image)

# Bug fix: wrap dummy_recognize in staticmethod. A plain function stored in a
# class dict becomes a bound method, so engine.recognize(img) would pass the
# engine instance as `image` and raise TypeError (too many arguments).
engine = type("Engine", (), {"recognize": staticmethod(dummy_recognize)})()
`concurrent.futures.ThreadPoolExecutor` でラップできます。ただし、`ai` インスタンスは1つだけに保つことを忘れずに – 各スレッドが `ai.initialize` を呼び出すと “free AI resources” の目的に反します。 +- **Error resilience:** 各画像のループを `try/except` ブロックで囲み、1つの破損した PNG がバッチ全体を中断しないようにします。 + +## 結論 + +これで、**python ocr tutorial** として、**load png image** ファイルを読み込み、**batch OCR processing** を実行し、**free AI resources** を適切に管理する方法を示すチュートリアルが手に入りました。完全で実行可能な例は、**recognize text from image** オブジェクトを正確に認識し、後処理でクリーンアップする方法を示しているので、欠けている部品を探すことなく自分のプロジェクトにコピー&ペーストできます。 + +次のステップに進む準備はできましたか? スタブ化した `aocr` と `ai` モジュールを、`pytesseract` や `torchvision` などの実際のライブラリに置き換えてみてください。また、スクリプトを拡張して JSON 出力したり、データベースに結果をプッシュしたり、クラウドストレージバケットと統合したりすることも可能です。可能性は無限です—ハッピーコーディング! + +{{< /blocks/products/pf/tutorial-page-section >}} +{{< /blocks/products/pf/main-container >}} +{{< /blocks/products/pf/main-wrap-class >}} +{{< blocks/products/products-backtop-button >}} \ No newline at end of file diff --git a/ocr/japanese/python/general/run-ocr-on-image-complete-guide-to-structured-text-extractio/_index.md b/ocr/japanese/python/general/run-ocr-on-image-complete-guide-to-structured-text-extractio/_index.md new file mode 100644 index 000000000..fc3132011 --- /dev/null +++ b/ocr/japanese/python/general/run-ocr-on-image-complete-guide-to-structured-text-extractio/_index.md @@ -0,0 +1,252 @@ +--- +category: general +date: 2026-05-03 +description: 構造化OCR認識を用いて画像上でOCRを実行し、座標付きテキストを抽出する方法を学びましょう。ステップバイステップのPythonコードが含まれています。 +draft: false +keywords: +- run OCR on image +- extract text with coordinates +- structured OCR recognition +- OCR post‑processing +- bounding box extraction +- image text detection +language: ja +og_description: 画像に対してOCRを実行し、構造化OCR認識を使用して座標付きテキストを取得します。説明付きの完全なPython例です。 +og_title: 画像でOCRを実行 – 構造化テキスト抽出チュートリアル +tags: +- OCR +- Python +- Computer Vision +title: 画像でOCRを実行する – 構造化テキスト抽出の完全ガイド +url: /ja/python/general/run-ocr-on-image-complete-guide-to-structured-text-extractio/ +--- + +{{< blocks/products/pf/main-wrap-class >}} +{{< 
blocks/products/pf/main-container >}} +{{< blocks/products/pf/tutorial-page-section >}} + +# 画像で OCR を実行する – 構造化テキスト抽出の完全ガイド + +画像ファイルに対して **OCR を実行** したいけれど、各単語の正確な位置を保持する方法が分からない、ということはありませんか?レシートのスキャン、フォームのデジタル化、UI テストなど、多くのプロジェクトで、生のテキストだけでなく、各行が画像上のどこにあるかを示すバウンディングボックスが必要になります。 + +このチュートリアルでは、**aocr** エンジンを使って *画像で OCR を実行* し、**構造化 OCR 認識** をリクエストし、ジオメトリを保持したまま結果を後処理する実用的な方法を紹介します。最後まで読めば、数行の Python コードで **座標付きテキスト抽出** ができるようになり、構造化モードが下流タスクで重要になる理由が理解できるようになります。 + +## 学べること + +- **構造化 OCR 認識** 用に OCR エンジンを初期化する方法 +- 画像を入力して、行のバウンディング情報を含む生データを取得する方法 +- ジオメトリを失わずにテキストをクリーンアップする後処理の実行方法 +- 最終的な行をイテレートし、テキストとバウンディングボックスを一緒に出力する方法 + +マジックや隠された手順はありません。すぐに自分のプロジェクトに組み込める、完全に実行可能なサンプルです。 + +--- + +## 前提条件 + +作業を始める前に、以下がインストールされていることを確認してください。 + +```bash +pip install aocr ai # hypothetical packages; replace with real ones if needed +``` + +また、テキストがはっきりと読める画像ファイル(`input_image.png` または `.jpg`)が必要です。スキャンした請求書からスクリーンショットまで、OCR エンジンが文字を認識できるものであれば問題ありません。 + +--- + +## 手順 1: 構造化認識用に OCR エンジンを初期化する + +まず最初に `aocr.Engine()` のインスタンスを作成し、**構造化 OCR 認識** を要求します。構造化モードでは、プレーンテキストだけでなく、各行のバウンディング矩形というジオメトリデータも返されます。これはテキストを画像上にマッピングする際に必須です。 + +```python +import aocr +import ai # hypothetical post‑processing module + +# Initialise the OCR engine +ocr_engine = aocr.Engine() + +# Request structured recognition (text + geometry) +ocr_engine.recognize_mode = aocr.RecognitionMode.Structured +``` + +> **なぜ重要か:** +> デフォルトモードではエンジンが単に連結された文字列を返すだけになることがあります。構造化モードはページ → 行 → 単語という階層構造と座標情報を提供するため、結果を元画像にオーバーレイしたり、レイアウト認識モデルに入力したりするのが格段に楽になります。 + +--- + +## 手順 2: 画像で OCR を実行し、生データを取得する + +画像をエンジンに渡します。`recognize` 呼び出しは `OcrResult` オブジェクトを返し、行ごとのコレクションが含まれます。 + +```python +# Load your image (any format supported by aocr) +input_image_path = "input_image.png" + +# Run OCR – this returns an OcrResult with lines and bounds +raw_result = ocr_engine.recognize(input_image_path) +``` + +この時点で `raw_result.lines` には次の 2 つの重要な属性を持つオブジェクトが格納されています。 + +- `text` – その行で認識された文字列 +- `bounds` – 行の位置を表す 
`(x, y, width, height)` 形式のタプル + +--- + +## 手順 3: ジオメトリを保持しながら後処理する + +生の OCR 出力はノイズが多いことがあります。余計な文字や誤ったスペース、改行の問題などです。`ai.run_postprocessor` 関数はテキストをクリーンアップしつつ **元のジオメトリを保持** します。これにより、正確な座標情報がそのまま残ります。 + +```python +# Apply a post‑processing step that corrects common OCR errors +postprocessed_result = ai.run_postprocessor(raw_result) + +# The structure (lines + bounds) stays the same, only `line.text` changes +``` + +> **プロのコツ:** 製品コードなどドメイン固有の語彙がある場合は、カスタム辞書を後処理に渡すことで精度を向上させられます。 + +--- + +## 手順 4: 座標付きテキストを抽出し、表示する + +最後に、クリーンアップされた行をループし、テキストと一緒にバウンディングボックスを出力します。これが **座標付きテキスト抽出** の核心です。 + +```python +# Print each recognised line together with its bounding box +for line in postprocessed_result.lines: + print(f"[{line.bounds}] {line.text}") +``` + +### 期待される出力 + +入力画像に「Invoice #12345」および「Total: $89.99」の 2 行が含まれていると仮定すると、次のような出力が得られます。 + +``` +[(15, 30, 210, 25)] Invoice #12345 +[(15, 70, 190, 25)] Total: $89.99 +``` + +最初のタプルは元画像上の行の `(x, y, width, height)` を示しており、矩形を描画したりテキストをハイライトしたり、座標を別システムに渡したりするのに利用できます。 + +--- + +## 結果の可視化(任意) + +バウンディングボックスを画像に重ねて確認したい場合は、Pillow(PIL)を使って矩形を描画できます。以下は簡単なサンプルです。生データだけが必要な場合はスキップして構いません。 + +```python +from PIL import Image, ImageDraw + +# Open the original image +img = Image.open(input_image_path) +draw = ImageDraw.Draw(img) + +# Draw a rectangle around each line +for line in postprocessed_result.lines: + x, y, w, h = line.bounds + draw.rectangle([x, y, x + w, y + h], outline="red", width=2) + +# Save or show the annotated image +img.save("annotated_output.png") +img.show() +``` + +![画像で OCR を実行した例(バウンディングボックス表示)](/images/ocr-bounding-boxes.png "画像で OCR を実行 – バウンディングボックスのオーバーレイ") + +上記の alt テキストは **主要キーワード** を含んでおり、画像 alt 属性の SEO 要件を満たしています。 + +--- + +## なぜ構造化 OCR 認識が単純テキスト抽出より優れているのか + +「ただ OCR を走らせてテキストだけ取ればいいんじゃない?」と思うかもしれませんが、ジオメトリがあると次のような利点があります。 + +- **空間的コンテキスト:** フォーム上で「日付」フィールドの隣にある日付の値など、データが *どこに* あるかが分かります。 +- **マルチカラムレイアウト:** 直線的なテキストだけでは順序が失われますが、構造化データは列順序を保持します。 +- **後処理精度:** 
def run_structured_ocr(image_path: str, annotate: bool = False):
    """Recognise `image_path` in structured mode, print every recognised line
    together with its bounding box, and optionally save an annotated copy
    of the input image ("annotated_<original name>")."""
    # Engine configured for structured output (text + geometry per line).
    engine = aocr.Engine()
    engine.recognize_mode = aocr.RecognitionMode.Structured

    # OCR first, then clean the text; the bounding boxes stay untouched.
    recognised = engine.recognize(image_path)
    refined = ai.run_postprocessor(recognised)

    for line in refined.lines:
        print(f"[{line.bounds}] {line.text}")

    if not annotate:
        return

    # Draw a red rectangle around every recognised line and save the result.
    canvas = Image.open(image_path)
    painter = ImageDraw.Draw(canvas)
    for line in refined.lines:
        x, y, w, h = line.bounds
        painter.rectangle([x, y, x + w, y + h], outline="red", width=2)
    annotated_path = "annotated_" + image_path
    canvas.save(annotated_path)
    print(f"Annotated image saved as {annotated_path}")
各行の **座標付きテキストを抽出** +3. 任意で、バウンディングボックスを描画した PNG を生成 + +--- + +## 結論 + +これで **画像で OCR を実行** し、**構造化 OCR 認識** を用いて **座標付きテキスト抽出** を行うための、自己完結型ソリューションが手に入りました。エンジンの初期化から後処理、可視化までの全工程がコードで示されているので、レシート、フォーム、任意の視覚文書に対して正確なテキスト位置情報を取得できるようになります。 + +次のステップは? `aocr` エンジンを別のライブラリ(Tesseract、EasyOCR など)に置き換えて、構造化出力の違いを比較してみましょう。ドメイン固有のスペルチェックや正規表現フィルタを組み合わせて精度をさらに向上させることも可能です。大規模パイプラインを構築する場合は、`(text, bounds)` ペアをデータベースに保存して後で分析に活用してください。 + +Happy coding, and may your OCR projects be ever accurate! + +{{< /blocks/products/pf/tutorial-page-section >}} +{{< /blocks/products/pf/main-container >}} +{{< /blocks/products/pf/main-wrap-class >}} +{{< blocks/products/products-backtop-button >}} \ No newline at end of file diff --git a/ocr/korean/python/general/extract-text-from-image-ocr-with-aspose-ai-spell-check/_index.md b/ocr/korean/python/general/extract-text-from-image-ocr-with-aspose-ai-spell-check/_index.md new file mode 100644 index 000000000..11f5bd863 --- /dev/null +++ b/ocr/korean/python/general/extract-text-from-image-ocr-with-aspose-ai-spell-check/_index.md @@ -0,0 +1,228 @@ +--- +category: general +date: 2026-05-03 +description: Aspose OCR 및 AI 맞춤법 검사를 사용하여 이미지에서 텍스트를 추출합니다. OCR 이미지 방법, OCR을 위한 이미지 + 로드, 청구서에서 텍스트 인식 및 GPU 리소스 해제에 대해 배워보세요. +draft: false +keywords: +- extract text from image +- how to ocr image +- load image for ocr +- release gpu resources +- recognize text from invoice +language: ko +og_description: Aspose OCR 및 AI 맞춤법 검사를 사용하여 이미지에서 텍스트를 추출합니다. 이미지 OCR 방법, OCR을 위한 + 이미지 로드, GPU 리소스 해제 등을 다루는 단계별 가이드. 
+og_title: 이미지에서 텍스트 추출 – 완전한 OCR 및 맞춤법 검사 가이드 +tags: +- OCR +- Aspose +- AI +- Python +title: 이미지에서 텍스트 추출 – Aspose AI 맞춤법 검사와 OCR +url: /ko/python/general/extract-text-from-image-ocr-with-aspose-ai-spell-check/ +--- + +{{< blocks/products/pf/main-wrap-class >}} +{{< blocks/products/pf/main-container >}} +{{< blocks/products/pf/tutorial-page-section >}} + +# 이미지에서 텍스트 추출 – 완전 OCR 및 맞춤법 검사 가이드 + +이미 **이미지에서 텍스트 추출**이 필요했지만 어느 라이브러리가 속도와 정확성을 모두 제공할지 몰랐던 적이 있나요? 여러분만 그런 것이 아닙니다. 실제 프로젝트—예를 들어 청구서 처리, 영수증 디지털화, 계약서 스캔—에서 사진으로부터 깨끗하고 검색 가능한 텍스트를 얻는 것이 첫 번째 관문입니다. + +좋은 소식은 Aspose OCR과 가벼운 Aspose AI 모델을 결합하면 몇 줄의 파이썬 코드만으로 이 작업을 처리할 수 있다는 것입니다. 이번 튜토리얼에서는 **이미지 OCR** 방법, 이미지를 올바르게 로드하는 방법, 내장 맞춤법 검사 후처리를 실행하는 방법, 그리고 **GPU 리소스 해제** 방법을 단계별로 살펴보겠습니다. + +이 가이드를 마치면 **청구서 이미지에서 텍스트 인식**을 수행하고 일반적인 OCR 오류를 자동으로 교정하며, 다음 배치를 위해 GPU를 깔끔하게 유지할 수 있습니다. + +--- + +## 준비물 + +- Python 3.9 이상 (코드에 타입 힌트가 사용되었지만 이전 3.x 버전에서도 동작합니다) +- `aspose-ocr` 및 `aspose-ai` 패키지 (`pip install aspose-ocr aspose-ai` 로 설치) +- CUDA 지원 GPU는 선택 사항이며, GPU가 없을 경우 스크립트가 자동으로 CPU로 전환됩니다. +- 예시 이미지, 예: `sample_invoice.png` 를 참조 가능한 폴더에 배치 + +무거운 ML 프레임워크도, 대용량 모델 다운로드도 필요 없습니다—대부분의 GPU에 편하게 들어가는 작은 Q4‑K‑M 양자화 모델만 있으면 됩니다. + +--- + +## Step 1: Initialise the OCR Engine – extract text from image + +먼저 `OcrEngine` 인스턴스를 생성하고 기대하는 언어를 지정합니다. 여기서는 영어를 선택하고 평문 텍스트 출력을 요청합니다. 이는 후속 처리에 이상적입니다. + +```python +import aocr # Aspose OCR package +import aspose.ai as ai # Aspose AI package + +# Initialise the OCR engine +ocr_engine = aocr.OcrEngine() +ocr_engine.language = aocr.Language.English # Choose any supported language +ocr_engine.recognize_mode = aocr.RecognitionMode.Plain # Plain text makes post‑processing easier +``` + +**왜 중요한가:** 언어를 지정하면 문자 집합이 제한돼 정확도가 향상됩니다. 평문 모드는 레이아웃 정보를 제거하는데, 이는 단순히 **이미지에서 텍스트 추출**만 원할 때 보통 필요하지 않은 정보입니다. + +--- + +## Step 2: Load image for OCR – how to OCR image + +이제 엔진에 실제 이미지를 전달합니다. `Image.load` 헬퍼는 일반적인 포맷(PNG, JPEG, TIFF)을 이해하고 파일‑IO quirks를 추상화합니다. 
+ +```python +# Load the input image – this is the "load image for OCR" step +input_image = aocr.Image.load("YOUR_DIRECTORY/sample_invoice.png") +raw_text = ocr_engine.recognize(input_image) # Returns the recognised text as a string +``` + +**팁:** 원본 이미지가 크다면 엔진에 전달하기 전에 리사이즈를 고려하세요. 작은 해상도는 GPU 메모리 사용량을 줄이면서 인식 품질에 큰 영향을 주지 않습니다. + +--- + +## Step 3: Configure the Aspose AI Model – recognize text from invoice + +Aspose AI는 자동 다운로드 가능한 작은 GGUF 모델을 제공합니다. 예제에서는 `Qwen2.5‑3B‑Instruct‑GGUF` 저장소를 사용하며, `q4_k_m` 로 양자화되었습니다. 또한 런타임에 GPU에 20 레이어를 할당하도록 설정해 속도와 VRAM 사용량의 균형을 맞춥니다. + +```python +# Model configuration – auto‑download a small Q4‑K‑M quantised model +model_config = ai.AsposeAIModelConfig() +model_config.allow_auto_download = "true" +model_config.hugging_face_repo_id = "bartowski/Qwen2.5-3B-Instruct-GGUF" +model_config.hugging_face_quantization = "q4_k_m" +model_config.gpu_layers = 20 # Use 20 GPU layers when a GPU is available +``` + +**내부 동작:** 양자화된 모델은 디스크에 약 1.5 GB 정도이며, 전체 정밀도 모델에 비해 훨씬 작지만 일반적인 OCR 오타를 잡아낼 만큼 충분한 언어적 뉘앙스를 보유하고 있습니다. + +--- + +## Step 4: Initialise AsposeAI and attach the spell‑check post‑processor + +Aspose AI에는 즉시 사용할 수 있는 맞춤법 검사 후처리기가 포함되어 있습니다. 이를 연결하면 모든 OCR 결과가 자동으로 정제됩니다. + +```python +# Initialise AsposeAI and attach the built‑in spell‑check post‑processor +ocr_ai = ai.AsposeAI(model_config) # Pass the config we just built +ocr_ai.set_post_processor(ocr_ai.postprocessor_spell_check, {}) # Empty dict → default settings +``` + +**왜 후처리기를 사용할까?** OCR 엔진은 종종 “Invoice”를 “Invo1ce” 혹은 “Total”을 “T0tal” 로 오인식합니다. 맞춤법 검사는 가벼운 언어 모델을 원시 문자열에 적용해 이러한 오류를 사용자 정의 사전 없이 교정합니다. + +--- + +## Step 5: Run the spell‑check post‑processor on the OCR result + +모든 설정이 끝났다면 한 번의 호출로 교정된 텍스트를 얻을 수 있습니다. 원본과 정제된 두 버전을 모두 출력해 개선된 모습을 확인해 보세요. 
+ +```python +# Run the spell‑check post‑processor on the OCR result +corrected_text = ocr_ai.run_postprocessor(raw_text) + +print("Original :", raw_text) +print("Corrected:", corrected_text) +``` + +청구서에 대한 일반적인 출력 예시: + +``` +Original : Invo1ce #12345 +Date: 2023/07/15 +Total: $1,250.00 +... +Corrected: Invoice #12345 +Date: 2023/07/15 +Total: $1,250.00 +... +``` + +“Invo1ce”가 올바른 단어 “Invoice”로 바뀐 것을 확인할 수 있습니다. 이것이 내장 AI 맞춤법 검사의 힘입니다. + +--- + +## Step 6: Release GPU resources – release gpu resources safely + +이 코드를 장시간 실행되는 서비스(예: 분당 수십 개의 청구서를 처리하는 웹 API)에서 사용한다면, 각 배치 후 GPU 컨텍스트를 해제해야 합니다. 그렇지 않으면 메모리 누수가 발생하고 결국 “CUDA out of memory” 오류가 뜹니다. + +```python +# Release GPU resources – crucial to avoid memory leaks +ocr_ai.free_resources() +``` + +**프로 팁:** `free_resources()` 를 `finally` 블록이나 컨텍스트 매니저 안에서 호출해 예외가 발생해도 항상 실행되도록 하세요. + +--- + +## Full Working Example + +모든 파트를 하나로 합치면 어느 프로젝트에든 넣을 수 있는 독립 실행형 스크립트가 완성됩니다. + +```python +# extract_text_from_image.py +import aocr +import aspose.ai as ai + +def main(): + # Step 1: Initialise OCR engine + ocr_engine = aocr.OcrEngine() + ocr_engine.language = aocr.Language.English + ocr_engine.recognize_mode = aocr.RecognitionMode.Plain + + # Step 2: Load image for OCR + input_image = aocr.Image.load("YOUR_DIRECTORY/sample_invoice.png") + raw_text = ocr_engine.recognize(input_image) + + # Step 3: Configure Aspose AI model + model_cfg = ai.AsposeAIModelConfig() + model_cfg.allow_auto_download = "true" + model_cfg.hugging_face_repo_id = "bartowski/Qwen2.5-3B-Instruct-GGUF" + model_cfg.hugging_face_quantization = "q4_k_m" + model_cfg.gpu_layers = 20 + + # Step 4: Initialise AI and attach spell‑check + ocr_ai = ai.AsposeAI(model_cfg) + ocr_ai.set_post_processor(ocr_ai.postprocessor_spell_check, {}) + + # Step 5: Run spell‑check + corrected_text = ocr_ai.run_postprocessor(raw_text) + + print("Original :", raw_text) + print("Corrected:", corrected_text) + + # Step 6: Release GPU resources + ocr_ai.free_resources() + +if 
__name__ == "__main__": + main() +``` + +파일을 저장하고 이미지 경로를 조정한 뒤 `python extract_text_from_image.py` 를 실행하세요. 정제된 청구서 텍스트가 콘솔에 출력될 것입니다. + +--- + +## Frequently Asked Questions (FAQ) + +**Q: CPU 전용 머신에서도 작동하나요?** +A: 네. GPU가 감지되지 않으면 Aspose AI가 자동으로 CPU 실행으로 전환되지만 속도는 느려집니다. `model_cfg.gpu_layers = 0` 으로 강제로 CPU만 사용하도록 설정할 수도 있습니다. + +**Q: 청구서가 영어가 아닌 다른 언어라면 어떻게 하나요?** +A: `ocr_engine.language` 를 해당 언어에 맞는 enum 값(e.g., `aocr.Language.Spanish`) 으로 변경하세요. 맞춤법 검사 모델은 다국어를 지원하지만, 언어별 모델을 사용하면 더 좋은 결과를 얻을 수 있습니다. + +**Q: 여러 이미지를 반복문으로 처리할 수 있나요?** +A: 가능합니다. 로드, 인식, 후처리 단계를 `for` 루프 안으로 옮기면 됩니다. 동일한 AI 인스턴스를 재사용한다면 루프 종료 후 혹은 각 배치 후 `ocr_ai.free_resources()` 를 호출하는 것을 잊지 마세요. + +**Q: 모델 다운로드 크기는 얼마나 되나요?** +A: 양자화된 `q4_k_m` 버전은 약 1.5 GB 정도입니다. 최초 실행 시 캐시되며 이후 실행은 즉시 이루어집니다. + +--- + +## Conclusion + +이번 튜토리얼에서는 Aspose OCR을 사용해 **이미지에서 텍스트 추출**하는 방법, 작은 AI 모델을 설정하고, 맞춤법 검사 후처리를 적용하며, **GPU 리소스 해제**까지 안전하게 수행하는 전체 흐름을 보여드렸습니다. 이미지 로드부터 정리까지 모든 과정을 포함한 파이프라인으로 **청구서에서 텍스트 인식** 시나리오에 신뢰할 수 있는 솔루션을 제공합니다. + +다음 단계는 맞춤형 엔터티 추출 모델로 맞춤법 검사를 교체해 보는 것입니다. + +{{< /blocks/products/pf/tutorial-page-section >}} +{{< /blocks/products/pf/main-container >}} +{{< /blocks/products/pf/main-wrap-class >}} +{{< blocks/products/products-backtop-button >}} \ No newline at end of file diff --git a/ocr/korean/python/general/how-to-batch-ocr-with-aspose-ocr-full-python-guide/_index.md b/ocr/korean/python/general/how-to-batch-ocr-with-aspose-ocr-full-python-guide/_index.md new file mode 100644 index 000000000..ea8d22342 --- /dev/null +++ b/ocr/korean/python/general/how-to-batch-ocr-with-aspose-ocr-full-python-guide/_index.md @@ -0,0 +1,213 @@ +--- +category: general +date: 2026-05-03 +description: Aspose OCR와 AI 맞춤법 검사를 사용하여 이미지를 일괄 OCR하는 방법. 이미지에서 텍스트를 추출하고, 맞춤법 검사를 + 적용하며, 무료 AI 리소스를 활용하고 OCR 오류를 수정하는 방법을 배웁니다. 
# Step 1: Initialize the OCR engine for English plain‑text output
def init_ocr() -> aocr.OcrEngine:
    """Build an OCR engine tuned for English, plain-text recognition.

    Plain mode keeps the output a raw string with no layout metadata, which
    is the lightest possible input for the spell-check post-processor that
    runs afterwards.
    """
    engine = aocr.OcrEngine()
    engine.language = aocr.Language.English
    engine.recognize_mode = aocr.RecognitionMode.Plain
    return engine
# Step 3: OCR an image and run the spell‑check post‑processor
def ocr_and_correct(image_path: str,
                    ocr_engine: aocr.OcrEngine,
                    ai_processor: aocr.ai.AsposeAI) -> str:
    """Recognise `image_path` with `ocr_engine` and return the text after the
    AI spell-check post-processor has cleaned it up."""
    loaded = aocr.Image.load(image_path)  # any format aocr supports
    # Feed the raw OCR string straight into the post-processor.
    return ai_processor.run_postprocessor(ocr_engine.recognize(loaded))
# Step 4: Process an entire folder of images
if __name__ == "__main__":
    # Initialize once – reuse for every file
    ocr_engine = init_ocr()
    ai_processor = init_ai()

    input_dir = "YOUR_DIRECTORY/input_images"
    output_dir = "YOUR_DIRECTORY/output_text"
    os.makedirs(output_dir, exist_ok=True)

    # Bug fix: release AI resources in a finally block, as the guide itself
    # recommends. Previously a single failing image would abort the loop and
    # leak the GPU allocation because free_resources() was never reached.
    try:
        for file_name in os.listdir(input_dir):
            # Only handle common image extensions
            if not file_name.lower().endswith(('.png', '.jpg', '.jpeg', '.tif', '.tiff')):
                continue

            image_path = os.path.join(input_dir, file_name)
            corrected = ocr_and_correct(image_path, ocr_engine, ai_processor)

            txt_path = os.path.join(output_dir,
                                    os.path.splitext(file_name)[0] + ".txt")
            with open(txt_path, "w", encoding="utf-8") as txt_file:
                txt_file.write(corrected)

            print(f"Processed {file_name}")
    finally:
        # Step 5: Release **free AI resources** even if the batch fails midway
        ai_processor.free_resources()
| +| **Missing language pack** | OCR이 빈 문자열을 반환 | `asposeocr` 버전에 영어 언어 데이터가 포함되어 있는지 확인하고, 필요하면 재설치. | +| **Non‑image files** | `.pdf` 등 의도치 않은 파일 때문에 스크립트가 충돌 | `if not file_name.lower().endswith(...)` 조건이 이미 해당 파일들을 건너뛰도록 구현됨. | +| **Spell‑check not applied** | 출력이 원시 OCR과 동일 | 루프 전에 `ai_processor.set_post_processor`가 호출되었는지 확인. | +| **Slow batch speed** | 이미지당 5초 이상 소요 | 첫 실행 후 `model_cfg.allow_auto_download = "false"`로 설정해 모델 재다운로드를 방지. | + +**Pro tip:** 영어가 아닌 다른 언어에서 **이미지에서 텍스트 추출**이 필요하면 `ocr_engine.language`를 해당 열거형으로 변경하면 됩니다(예: `aocr.Language.French`). 동일한 AI 후처리기가 맞춤법 검사를 적용하지만, 최상의 결과를 위해 언어별 모델을 사용하는 것이 좋습니다. + +--- + +## Recap & Next Steps + +우리는 **배치 OCR** 전체 파이프라인을 다음과 같이 정리했습니다: + +1. **Initialize** – 영어용 plain‑text OCR 엔진 초기화. +2. **Configure** – AI 맞춤법 검사 모델을 설정하고 후처리기로 바인딩. +3. **Run** – 각 이미지에 OCR을 실행하고 AI가 **OCR 오류를 자동 교정**하도록 함. +4. **Loop** – 디렉터리를 순회하며 **이미지에서 텍스트를 대량 추출**. +5. **Free AI resources** – 작업이 끝나면 리소스를 해제. + +다음과 같은 확장이 가능합니다: + +- 교정된 텍스트를 하위 NLP 파이프라인(감성 분석, 엔터티 추출 등)으로 전달 +- `ai_processor.set_post_processor(your_custom_func, {})`를 호출해 맞춤법 검사 대신 맞춤형 요약기 후처리기로 교체 +- GPU가 멀티 스트림을 지원한다면 `concurrent.futures.ThreadPoolExecutor`를 이용해 폴더 순회를 병렬화 + +--- + +## Final Thoughts + +배치 OCR은 복잡할 필요가 없습니다. Aspose OCR과 가벼운 AI 모델을 결합하면 **이미지에서 텍스트 추출**, **맞춤법 검사 적용**, **OCR 오류 교정**, **AI 리소스 해제**를 한 번에 해결하는 **원스톱 솔루션**을 얻을 수 있습니다. 테스트 폴더에서 스크립트를 실행해 보고, GPU 레이어 수를 하드웨어에 맞게 조정하면 몇 분 안에 프로덕션 수준 파이프라인을 구축할 수 있습니다. + +모델 튜닝, PDF 처리, 웹 서비스 연동 등에 대한 질문이 있으면 아래 댓글을 남기거나 GitHub에서 저에게 ping 주세요. 즐거운 코딩 되시고, OCR이 언제나 정확하기를 바랍니다! 
+ +{{< /blocks/products/pf/tutorial-page-section >}} +{{< /blocks/products/pf/main-container >}} +{{< /blocks/products/pf/main-wrap-class >}} +{{< blocks/products/products-backtop-button >}} \ No newline at end of file diff --git a/ocr/korean/python/general/python-ocr-tutorial-batch-ocr-processing-made-easy/_index.md b/ocr/korean/python/general/python-ocr-tutorial-batch-ocr-processing-made-easy/_index.md new file mode 100644 index 000000000..bda8fb1f5 --- /dev/null +++ b/ocr/korean/python/general/python-ocr-tutorial-batch-ocr-processing-made-easy/_index.md @@ -0,0 +1,298 @@ +--- +category: general +date: 2026-05-03 +description: PNG 이미지 파일을 로드하고 이미지에서 텍스트를 인식하는 방법과 배치 OCR 처리를 위한 무료 AI 리소스를 보여주는 파이썬 + OCR 튜토리얼. +draft: false +keywords: +- python ocr tutorial +- batch ocr processing +- free ai resources +- load png image +- recognize text from image +language: ko +og_description: Python OCR 튜토리얼은 PNG 이미지를 로드하고, 이미지에서 텍스트를 인식하며, 배치 OCR 처리를 위한 무료 + AI 리소스를 활용하는 방법을 안내합니다. +og_title: Python OCR 튜토리얼 – 무료 AI 리소스로 빠른 배치 OCR +tags: +- OCR +- Python +- AI +title: 파이썬 OCR 튜토리얼 – 배치 OCR 처리 쉽게 만들기 +url: /ko/python/general/python-ocr-tutorial-batch-ocr-processing-made-easy/ +--- + +{{< blocks/products/pf/main-wrap-class >}} +{{< blocks/products/pf/main-container >}} +{{< blocks/products/pf/tutorial-page-section >}} + +# Python OCR 튜토리얼 – 배치 OCR 처리 쉽게 만들기 + +실제로 수십 개의 PNG 파일에 OCR을 실행할 수 있는 **python ocr tutorial**이 필요했던 적 있나요? 머리카락이 빠질 정도로 고생하지 않아도 됩니다. 여러분만 그런 것이 아닙니다. 많은 실제 프로젝트에서 **load png image** 파일을 로드하고 엔진에 전달한 뒤, 작업이 끝나면 AI 리소스를 정리해야 합니다. + +이 가이드에서는 **recognize text from image** 파일을 정확히 어떻게 인식하고, 배치로 처리하며, 기본 AI 메모리를 해제하는지 보여주는 완전한 실행 가능한 예제를 단계별로 살펴보겠습니다. 끝까지 읽으면 어떤 프로젝트에도 바로 넣어 사용할 수 있는 독립형 스크립트를 얻게 됩니다—불필요한 내용 없이 핵심만 제공합니다. + +## 필요 사항 + +- Python 3.10 이상 (여기서 사용된 문법은 f‑strings와 타입 힌트에 의존합니다) +- `engine.recognize` 메서드를 제공하는 OCR 라이브러리 – 데모 목적상 가상의 `aocr` 패키지를 가정하지만, Tesseract, EasyOCR 등으로 교체할 수 있습니다. 
# step_2_load_images.py
from pathlib import Path
from typing import List

def collect_png_paths(directory: str) -> List[Path]:
    """Return the PNG files inside `directory`, sorted by file name.

    Raises:
        NotADirectoryError: `directory` does not exist or is not a folder.
        FileNotFoundError: the folder contains no PNG files at all.
    """
    folder = Path(directory)
    if not folder.is_dir():
        raise NotADirectoryError(f"'{directory}' is not a valid folder.")

    # Sorting keeps the processing order deterministic across platforms.
    png_files = sorted(folder.glob("*.png"))
    if not png_files:
        raise FileNotFoundError("No PNG images found in the specified directory.")

    print(f"Found {len(png_files)} PNG image(s) to process.")
    return png_files
+ +```python +# step_3_ocr_batch.py +import aocr +import ai +from pathlib import Path +from typing import List + +def ocr_batch(image_paths: List[Path]) -> List[str]: + """ + Perform OCR on each PNG image and return a list of cleaned strings. + """ + results = [] + for image_path in image_paths: + # Load the image – aocr.Image.load abstracts away Pillow/OpenCV details + img = aocr.Image.load(str(image_path)) + + # Recognize raw text using the module-level engine provided by aocr + raw_text = aocr.engine.recognize(img) + + # Refine the raw OCR output using the AI post‑processor + cleaned_text = ai.run_postprocessor(raw_text) + results.append(cleaned_text) + + print(f"Processed {image_path.name}: {len(cleaned_text)} characters extracted.") + + return results +``` + +**로드와 인식을 분리하는 이유:** +`aocr.Image.load`는 지연 디코딩을 수행할 수 있어 대용량 배치에서 더 빠릅니다. 로드 단계를 명시적으로 두면 나중에 JPEG나 TIFF 파일을 처리해야 할 때 다른 이미지 라이브러리로 쉽게 교체할 수 있습니다. + +## 단계 4: 배치 종료 후 정리 – AI 리소스 해제 + +배치 작업이 끝나면 메모리 누수를 방지하기 위해 모델을 해제해야 합니다. 특히 GPU가 활성화된 환경에서는 필수입니다. + +```python +# step_4_cleanup.py +import ai + +def release_resources(): + """ + Free any allocated AI resources. Safe to call multiple times. + """ + if ai.is_initialized(): + ai.free_resources() + print("AI resources have been released.") + else: + print("No AI resources were allocated.") +``` + +## 전체 스크립트 합치기 – 완전한 예제 + +아래는 네 단계를 하나의 흐름으로 연결한 단일 파일입니다. `batch_ocr.py`라는 이름으로 저장하고 명령줄에서 실행하세요. + +```python +# batch_ocr.py +""" +Python OCR tutorial – end‑to‑end batch OCR processing. +Loads PNG images, runs OCR, post‑processes results, and frees AI resources.
+""" + +import sys +from pathlib import Path +import ai +import aocr + +# ---------------------------------------------------------------------- +# Helper functions (copied from the steps above) +# ---------------------------------------------------------------------- +def init_engine(cfg: str = "config.yaml"): + if not ai.is_initialized(): + ai.initialize(cfg) + else: + print("Engine already initialized.") + +def collect_png_paths(directory: str): + base_path = Path(directory) + if not base_path.is_dir(): + raise NotADirectoryError(f"'{directory}' is not a valid folder.") + png_files = sorted(base_path.glob("*.png")) + if not png_files: + raise FileNotFoundError("No PNG images found in the specified directory.") + print(f"Found {len(png_files)} PNG image(s) to process.") + return png_files + +def ocr_batch(image_paths): + results = [] + for image_path in image_paths: + img = aocr.Image.load(str(image_path)) + raw_text = engine.recognize(img) + cleaned_text = ai.run_postprocessor(raw_text) + results.append(cleaned_text) + print(f"Processed {image_path.name}: {len(cleaned_text)} characters.") + return results + +def release_resources(): + if ai.is_initialized(): + ai.free_resources() + print("AI resources released.") + else: + print("No resources to release.") + +# ---------------------------------------------------------------------- +# Main execution block +# ---------------------------------------------------------------------- +if __name__ == "__main__": + if len(sys.argv) != 2: + print("Usage: python batch_ocr.py ") + sys.exit(1) + + image_dir = sys.argv[1] + + try: + init_engine() + png_paths = collect_png_paths(image_dir) + texts = ocr_batch(png_paths) + + # Optional: write results to a single text file + output_file = Path("ocr_results.txt") + with output_file.open("w", encoding="utf-8") as f: + for path, txt in zip(png_paths, texts): + f.write(f"--- {path.name} ---\n") + f.write(txt + "\n\n") + print(f"All results saved to {output_file.resolve()}") + 
finally: + release_resources() +``` + +### 예상 출력 + +세 개의 PNG가 들어 있는 폴더에 대해 스크립트를 실행하면 다음과 같이 출력될 수 있습니다: + +``` +Engine already initialized. +Found 3 PNG image(s) to process. +Processed invoice1.png: 452 characters. +Processed receipt2.png: 317 characters. +Processed flyer3.png: 689 characters. +All results saved to /home/user/ocr_results.txt +AI resources released. +``` + +`ocr_results.txt` 파일에는 각 이미지마다 명확한 구분자가 포함되고, 그 뒤에 정리된 OCR 텍스트가 기록됩니다. + +## aocr 및 ai용 선택적 스텁 (실제 패키지가 없을 경우) + +무거운 OCR 라이브러리를 사용하지 않고 흐름만 테스트하고 싶다면 최소한의 모킹 모듈을 만들 수 있습니다: + +```python +# aocr/__init__.py +class Image: + @staticmethod + def load(path): + return f"ImageObject({path})" + +def dummy_recognize(image): + return "Raw OCR output for " + str(image) + +# staticmethod keeps dummy_recognize from being bound, so engine.recognize(img) works +engine = type("Engine", (), {"recognize": staticmethod(dummy_recognize)})() +``` + +```python +# ai/__init__.py +_state = {"initialized": False} + +def is_initialized(): + return _state["initialized"] + +def initialize(cfg): + print(f"Initializing AI engine with {cfg}") + _state["initialized"] = True + +def run_postprocessor(text): + # Very naive cleanup: strip extra spaces + return " ".join(text.split()) + +def free_resources(): + print("Freeing AI resources") + _state["initialized"] = False +``` + +이 폴더들을 `batch_ocr.py` 옆에 두면 스크립트가 실행되어 모의 결과를 출력합니다. + +## 전문가 팁 & 흔히 겪는 함정 + +- **Memory spikes:** 수천 개의 고해상도 PNG를 처리한다면 OCR 전에 이미지 크기를 조정하는 것을 고려하세요. `aocr.Image.load`는 종종 `max_size` 인자를 지원합니다. +- **Unicode handling:** 출력 파일을 항상 `encoding="utf-8"`로 열어야 합니다; OCR 엔진은 비ASCII 문자를 내보낼 수 있습니다. +- **Parallelism:** CPU 기반 OCR이라면 `ocr_batch`를 `concurrent.futures.ThreadPoolExecutor`로 감쌀 수 있습니다. 단, `ai` 인스턴스를 하나만 유지하세요—각 스레드가 `ai.initialize`를 호출하면 “AI 리소스 해제” 목표에 어긋납니다. +- **Error resilience:** 이미지별 루프를 `try/except` 블록으로 감싸면 하나의 손상된 PNG가 전체 배치를 중단시키는 일을 방지할 수 있습니다. + +## 결론 + +이제 **python ocr tutorial**을 통해 **load png image** 파일을 로드하고, **batch OCR processing**을 수행하며, **free AI resources**를 책임감 있게 관리하는 방법을 알게 되었습니다.
완전하고 실행 가능한 예제는 **recognize text from image** 객체를 어떻게 인식하고 이후에 정리하는지를 정확히 보여주므로, 누락된 부분을 찾지 않고 바로 프로젝트에 복사·붙여넣기 할 수 있습니다. + +다음 단계로는 스텁으로 만든 `aocr`와 `ai` 모듈을 실제 `pytesseract`·`torchvision` 같은 라이브러리로 교체해 보세요. 스크립트를 JSON 출력, 데이터베이스 저장, 클라우드 스토리지 연동 등으로 확장할 수도 있습니다. 가능성은 무한합니다—코딩을 즐기세요! + +{{< /blocks/products/pf/tutorial-page-section >}} +{{< /blocks/products/pf/main-container >}} +{{< /blocks/products/pf/main-wrap-class >}} +{{< blocks/products/products-backtop-button >}} \ No newline at end of file diff --git a/ocr/korean/python/general/run-ocr-on-image-complete-guide-to-structured-text-extractio/_index.md b/ocr/korean/python/general/run-ocr-on-image-complete-guide-to-structured-text-extractio/_index.md new file mode 100644 index 000000000..7b53f6979 --- /dev/null +++ b/ocr/korean/python/general/run-ocr-on-image-complete-guide-to-structured-text-extractio/_index.md @@ -0,0 +1,254 @@ +--- +category: general +date: 2026-05-03 +description: 구조화된 OCR 인식을 사용하여 이미지에서 OCR을 실행하고 좌표와 함께 텍스트를 추출하는 방법을 배워보세요. 단계별 파이썬 + 코드가 포함되어 있습니다. +draft: false +keywords: +- run OCR on image +- extract text with coordinates +- structured OCR recognition +- OCR post‑processing +- bounding box extraction +- image text detection +language: ko +og_description: 이미지에서 OCR을 실행하고 구조화된 OCR 인식을 사용해 좌표와 함께 텍스트를 추출합니다. 설명이 포함된 전체 Python + 예제. +og_title: 이미지에서 OCR 실행 – 구조화된 텍스트 추출 튜토리얼 +tags: +- OCR +- Python +- Computer Vision +title: 이미지에서 OCR 실행 – 구조화된 텍스트 추출 완전 가이드 +url: /ko/python/general/run-ocr-on-image-complete-guide-to-structured-text-extractio/ +--- + +{{< blocks/products/pf/main-wrap-class >}} +{{< blocks/products/pf/main-container >}} +{{< blocks/products/pf/tutorial-page-section >}} + +# 이미지에서 OCR 실행 – 구조화된 텍스트 추출 완전 가이드 + +이미지 파일에 **OCR을 실행**하고 각 단어의 정확한 위치를 유지하는 방법을 몰라 고민한 적 있나요? 여러분만 그런 것이 아닙니다. 영수증 스캔, 양식 디지털화, UI 테스트 등 많은 프로젝트에서 원시 텍스트뿐만 아니라 각 줄이 이미지 상에서 어디에 위치하는지를 알려주는 바운딩 박스가 필요합니다. 
+ +이 튜토리얼에서는 **aocr** 엔진을 사용해 *이미지에서 OCR을 실행*하고 **구조화된 OCR 인식**을 요청한 뒤, 기하 정보를 보존하면서 결과를 후처리하는 실용적인 방법을 보여드립니다. 몇 줄의 파이썬 코드만으로 **좌표와 함께 텍스트를 추출**할 수 있게 되며, 구조화 모드가 다운스트림 작업에 왜 중요한지도 이해하게 될 것입니다. + +## 배울 내용 + +- **구조화된 OCR 인식**을 위한 OCR 엔진 초기화 방법. +- 이미지를 입력하고 줄 경계가 포함된 원시 결과를 받는 방법. +- 기하 정보를 잃지 않으면서 텍스트를 정제하는 후처리 방법. +- 최종 줄을 순회하며 텍스트와 바운딩 박스를 함께 출력하는 방법. + +마법도, 숨겨진 단계도 없습니다—그냥 바로 실행 가능한 예제를 여러분의 프로젝트에 복사해 넣기만 하면 됩니다. + +--- + +## 사전 요구 사항 + +시작하기 전에 다음이 설치되어 있는지 확인하세요: + +```bash +pip install aocr ai # hypothetical packages; replace with real ones if needed +``` + +또한 읽기 쉬운 텍스트가 포함된 이미지 파일(`input_image.png` 또는 `.jpg`)이 필요합니다. 스캔한 청구서든 스크린샷이든 OCR 엔진이 문자를 인식할 수만 하면 됩니다. + +--- + +## 1단계: 구조화된 인식을 위한 OCR 엔진 초기화 + +먼저 `aocr.Engine()` 인스턴스를 만들고 **구조화된 OCR 인식**을 원한다는 것을 알려줍니다. 구조화 모드는 일반 텍스트뿐 아니라 각 줄에 대한 기하 데이터(바운딩 사각형)도 반환하므로, 텍스트를 이미지에 다시 매핑해야 할 때 필수적입니다. + +```python +import aocr +import ai # hypothetical post‑processing module + +# Initialise the OCR engine +ocr_engine = aocr.Engine() + +# Request structured recognition (text + geometry) +ocr_engine.recognize_mode = aocr.RecognitionMode.Structured +``` + +> **왜 중요한가:** +> 기본 모드에서는 엔진이 단순히 이어붙인 문자열만 반환할 수 있습니다. 구조화 모드는 페이지 → 줄 → 단어 계층 구조와 좌표를 제공하므로 원본 이미지 위에 결과를 오버레이하거나 레이아웃을 인식하는 모델에 바로 전달하기가 훨씬 쉬워집니다. + +--- + +## 2단계: 이미지에 OCR 실행 및 원시 결과 획득 + +이제 이미지를 엔진에 전달합니다. `recognize` 호출은 `OcrResult` 객체를 반환하며, 여기에는 각 줄마다 바운딩 사각형이 포함된 컬렉션이 들어 있습니다. + +```python +# Load your image (any format supported by aocr) +input_image_path = "input_image.png" + +# Run OCR – this returns an OcrResult with lines and bounds +raw_result = ocr_engine.recognize(input_image_path) +``` + +이 시점에서 `raw_result.lines`는 두 가지 중요한 속성을 가진 객체들을 보유합니다: + +- `text` – 해당 줄에 인식된 문자열. +- `bounds` – `(x, y, width, height)` 형태의 튜플로 줄의 위치를 설명합니다. + +--- + +## 3단계: 기하 정보를 보존하면서 후처리 + +원시 OCR 출력은 종종 잡음이 많습니다: 불필요한 문자, 잘못된 공백, 줄바꿈 문제 등. `ai.run_postprocessor` 함수는 텍스트를 정제하지만 **원본 기하 정보를 그대로 유지**하므로 정확한 좌표를 계속 사용할 수 있습니다. 
+ +```python +# Apply a post‑processing step that corrects common OCR errors +postprocessed_result = ai.run_postprocessor(raw_result) + +# The structure (lines + bounds) stays the same, only `line.text` changes +``` + +> **프로 팁:** 도메인별 어휘(예: 제품 코드)가 있다면 사용자 정의 사전을 후처리기에 전달해 정확도를 높이세요. + +--- + +## 4단계: 좌표와 함께 텍스트 추출 – 순회 및 출력 + +마지막으로 정제된 줄들을 순회하면서 각 줄의 바운딩 박스를 텍스트와 함께 출력합니다. 이것이 **좌표와 함께 텍스트를 추출**하는 핵심 로직입니다. + +```python +# Print each recognised line together with its bounding box +for line in postprocessed_result.lines: + print(f"[{line.bounds}] {line.text}") +``` + +### 예상 출력 + +입력 이미지에 두 줄 “Invoice #12345”와 “Total: $89.99”가 포함되어 있다고 가정하면 다음과 유사한 결과가 표시됩니다: + +``` +[(15, 30, 210, 25)] Invoice #12345 +[(15, 70, 190, 25)] Total: $89.99 +``` + +첫 번째 튜플은 원본 이미지 상에서 해당 줄의 `(x, y, width, height)`이며, 이를 이용해 사각형을 그리거나 텍스트를 강조하거나 좌표를 다른 시스템에 전달할 수 있습니다. + +--- + +## 결과 시각화 (선택 사항) + +이미지 위에 바운딩 박스를 오버레이하고 싶다면 Pillow(PIL)를 사용해 사각형을 그릴 수 있습니다. 아래는 간단한 스니펫이며, 원시 데이터만 필요하면 건너뛰어도 됩니다. + +```python +from PIL import Image, ImageDraw + +# Open the original image +img = Image.open(input_image_path) +draw = ImageDraw.Draw(img) + +# Draw a rectangle around each line +for line in postprocessed_result.lines: + x, y, w, h = line.bounds + draw.rectangle([x, y, x + w, y + h], outline="red", width=2) + +# Save or show the annotated image +img.save("annotated_output.png") +img.show() +``` + +![이미지에서 OCR 실행 예시 – 바운딩 박스 표시](/images/ocr-bounding-boxes.png "이미지에서 OCR 실행 – 바운딩 박스 오버레이") + +위 alt 텍스트에는 **주요 키워드**가 포함되어 있어 이미지 alt 속성에 대한 SEO 요구사항을 충족합니다. + +--- + +## 구조화된 OCR 인식이 단순 텍스트 추출보다 뛰어난 이유 + +“그냥 OCR만 하면 텍스트를 얻을 수 있지 않나요? 왜 좌표가 필요하죠?” 라고 생각할 수 있습니다. + +- **공간적 컨텍스트:** 양식에서 “날짜”와 같은 필드가 실제 값 옆에 위치할 때, 좌표가 데이터가 어디에 있는지 알려줍니다. +- **다중 컬럼 레이아웃:** 단순 선형 텍스트는 순서를 잃지만, 구조화된 데이터는 컬럼 순서를 보존합니다. +- **후처리 정확도:** 박스 크기를 알면 단어가 헤더인지, 각주인지, 혹은 잡음인지 판단하기 쉽습니다. + +요컨대, **구조화된 OCR 인식**은 데이터베이스 입력, 검색 가능한 PDF 생성, 레이아웃을 고려한 머신러닝 모델 학습 등 더 스마트한 파이프라인을 구축할 수 있는 유연성을 제공합니다. 
+ +--- + +## 흔히 마주치는 엣지 케이스와 해결 방법 + +| 상황 | 주의할 점 | 권장 해결책 | +|-----------|-------------------|---------------| +| **회전되거나 기울어진 이미지** | 바운딩 박스가 축을 벗어날 수 있음 | OpenCV `warpAffine` 등으로 디스키유(Deskew) 전처리 | +| **극히 작은 글꼴** | 엔진이 문자를 놓쳐 빈 줄이 생김 | 이미지 해상도 상승 또는 `ocr_engine.set_dpi(300)` 사용 | +| **다중 언어 혼합** | 잘못된 언어 모델 선택 시 텍스트가 깨짐 | 인식 전에 `ocr_engine.language = ["en", "de"]` 등 설정 | +| **중첩된 박스** | 후처리 단계에서 두 줄이 의도치 않게 합쳐질 수 있음 | 처리 후 `line.bounds` 확인; `ai.run_postprocessor` 임계값 조정 | + +초기에 이러한 상황을 대비하면, 하루에 수백 개의 문서를 처리하더라도 큰 어려움을 피할 수 있습니다. + +--- + +## 전체 엔드‑투‑엔드 스크립트 + +아래는 모든 단계를 하나로 묶은 완전 실행 가능한 프로그램입니다. 복사‑붙여넣기 후 이미지 경로만 수정하면 바로 사용할 수 있습니다. + +```python +# -*- coding: utf-8 -*- +""" +Run OCR on image – extract text with coordinates using structured OCR recognition. +Author: Your Name +Date: 2026-05-03 +""" + +import aocr +import ai +from PIL import Image, ImageDraw + +def run_structured_ocr(image_path: str, annotate: bool = False): + # 1️⃣ Initialise the OCR engine + ocr_engine = aocr.Engine() + ocr_engine.recognize_mode = aocr.RecognitionMode.Structured + + # 2️⃣ Recognise the image + raw_result = ocr_engine.recognize(image_path) + + # 3️⃣ Post‑process while keeping geometry + processed = ai.run_postprocessor(raw_result) + + # 4️⃣ Print each line with its bounding box + for line in processed.lines: + print(f"[{line.bounds}] {line.text}") + + # Optional visualisation + if annotate: + img = Image.open(image_path) + draw = ImageDraw.Draw(img) + for line in processed.lines: + x, y, w, h = line.bounds + draw.rectangle([x, y, x + w, y + h], outline="red", width=2) + annotated_path = "annotated_" + image_path + img.save(annotated_path) + print(f"Annotated image saved as {annotated_path}") + +if __name__ == "__main__": + INPUT_IMG = "input_image.png" + run_structured_ocr(INPUT_IMG, annotate=True) +``` + +이 스크립트를 실행하면: + +1. 구조화 모드로 **이미지에서 OCR 실행**. +2. 모든 줄에 대해 **좌표와 함께 텍스트 추출**. +3. 선택적으로 박스가 표시된 PNG를 생성. 
+ +--- + +## 결론 + +이제 **이미지에서 OCR을 실행**하고 **구조화된 OCR 인식**을 활용해 **좌표와 함께 텍스트를 추출**하는 완전하고 자체 포함된 솔루션을 갖추었습니다. 엔진 초기화부터 후처리, 시각적 검증까지 모든 단계가 코드에 포함되어 있어 영수증, 양식, 혹은 정확한 텍스트 위치 지정이 필요한 모든 시각 문서에 적용할 수 있습니다. + +다음 단계는 무엇일까요? `aocr` 엔진을 Tesseract, EasyOCR 등 다른 라이브러리로 교체해 구조화된 출력이 어떻게 다른지 실험해 보세요. 도메인에 맞는 맞춤형 정규식 필터나 맞춤법 검사와 같은 후처리 전략을 추가해 정확도를 더욱 높일 수 있습니다. 또한 더 큰 파이프라인을 구축한다면 `(text, bounds)` 쌍을 데이터베이스에 저장해 추후 분석에 활용해 보세요. + +코딩을 즐기시고, 여러분의 OCR 프로젝트가 언제나 정확하기를 바랍니다! + +{{< /blocks/products/pf/tutorial-page-section >}} +{{< /blocks/products/pf/main-container >}} +{{< /blocks/products/pf/main-wrap-class >}} +{{< blocks/products/products-backtop-button >}} \ No newline at end of file diff --git a/ocr/polish/python/general/extract-text-from-image-ocr-with-aspose-ai-spell-check/_index.md b/ocr/polish/python/general/extract-text-from-image-ocr-with-aspose-ai-spell-check/_index.md new file mode 100644 index 000000000..684f5a047 --- /dev/null +++ b/ocr/polish/python/general/extract-text-from-image-ocr-with-aspose-ai-spell-check/_index.md @@ -0,0 +1,231 @@ +--- +category: general +date: 2026-05-03 +description: wyodrębniaj tekst z obrazu przy użyciu Aspose OCR i AI spell‑check. Dowiedz + się, jak wykonać OCR obrazu, załadować obraz do OCR, rozpoznać tekst z faktury i + zwolnić zasoby GPU. +draft: false +keywords: +- extract text from image +- how to ocr image +- load image for ocr +- release gpu resources +- recognize text from invoice +language: pl +og_description: wyodrębnij tekst z obrazu za pomocą Aspose OCR i AI spell‑check. Przewodnik + krok po kroku opisujący, jak wykonać OCR obrazu, załadować obraz do OCR i zwolnić + zasoby GPU. 
+og_title: wyodrębnianie tekstu z obrazu – Kompletny przewodnik po OCR i sprawdzaniu + pisowni +tags: +- OCR +- Aspose +- AI +- Python +title: wyodrębnij tekst z obrazu – OCR z Aspose AI Spell‑Check +url: /pl/python/general/extract-text-from-image-ocr-with-aspose-ai-spell-check/ +--- + +{{< blocks/products/pf/main-wrap-class >}} +{{< blocks/products/pf/main-container >}} +{{< blocks/products/pf/tutorial-page-section >}} + +# Wyodrębnianie tekstu z obrazu – Kompletny przewodnik OCR i sprawdzania pisowni + +Czy kiedykolwiek potrzebowałeś **wyodrębnić tekst z obrazu**, ale nie byłeś pewien, która biblioteka zapewni zarówno szybkość, jak i dokładność? Nie jesteś jedyny. W wielu rzeczywistych projektach — myśl o przetwarzaniu faktur, digitalizacji paragonów lub skanowaniu umów — uzyskanie czystego, przeszukiwalnego tekstu z obrazu jest pierwszą przeszkodą. + +Dobrą wiadomością jest to, że Aspose OCR w połączeniu z lekkim modelem Aspose AI może wykonać to zadanie w kilku linijkach Pythona. W tym samouczku przeprowadzimy Cię przez **jak wykonać OCR obrazu**, załadujemy obraz poprawnie, uruchomimy wbudowany procesor sprawdzania pisowni i w końcu **zwolnić zasoby GPU**, aby Twoja aplikacja była przyjazna dla pamięci. + +Po zakończeniu tego przewodnika będziesz w stanie **rozpoznawać tekst z faktur** obrazów, automatycznie korygować typowe błędy OCR i utrzymać swój GPU w czystości dla kolejnej partii. + +--- + +## Czego będziesz potrzebować + +- Python 3.9 lub nowszy (kod używa podpowiedzi typów, ale działa także w starszych wersjach 3.x) +- pakiety `aspose-ocr` i `aspose-ai` (instaluj za pomocą `pip install aspose-ocr aspose-ai`) +- GPU z obsługą CUDA jest opcjonalny; skrypt przełączy się na CPU, jeśli nie zostanie wykryty. +- Przykładowy obraz, np. `sample_invoice.png`, umieszczony w folderze, do którego możesz odwołać się. + +Brak ciężkich frameworków ML, brak ogromnych pobrań modeli — tylko mały, kwantowany model Q4‑K‑M, który wygodnie mieści się na większości GPU. 
+ +--- + +## Krok 1: Inicjalizacja silnika OCR – wyodrębnianie tekstu z obrazu + +Pierwszą rzeczą, którą robisz, jest stworzenie instancji `OcrEngine` i określenie, jakiego języka oczekujesz. Tutaj wybieramy angielski i żądamy wyjścia w formacie plain‑text, co jest idealne do dalszego przetwarzania. + +```python +import aocr # Aspose OCR package +import aspose.ai as ai # Aspose AI package + +# Initialise the OCR engine +ocr_engine = aocr.OcrEngine() +ocr_engine.language = aocr.Language.English # Choose any supported language +ocr_engine.recognize_mode = aocr.RecognitionMode.Plain # Plain text makes post‑processing easier +``` + +**Dlaczego to ważne:** Ustawienie języka ogranicza zestaw znaków, zwiększając dokładność. Tryb plain‑text usuwa informacje o układzie, które zazwyczaj nie są potrzebne, gdy chcesz po prostu wyodrębnić tekst z obrazu. + +--- + +## Krok 2: Ładowanie obrazu do OCR – jak wykonać OCR obrazu + +Teraz podajemy silnikowi rzeczywisty obraz. Pomocnicza funkcja `Image.load` obsługuje popularne formaty (PNG, JPEG, TIFF) i ukrywa szczegóły związane z operacjami we/wy. + +```python +# Load the input image – this is the "load image for OCR" step +input_image = aocr.Image.load("YOUR_DIRECTORY/sample_invoice.png") +raw_text = ocr_engine.recognize(input_image) # Returns the recognised text as a string +``` + +**Wskazówka:** Jeśli Twoje obrazy źródłowe są duże, rozważ ich zmniejszenie przed przekazaniem do silnika; mniejsze wymiary mogą zmniejszyć zużycie pamięci GPU, nie wpływając negatywnie na jakość rozpoznawania. + +--- + +## Krok 3: Konfiguracja modelu Aspose AI – rozpoznawanie tekstu z faktury + +Aspose AI dostarcza mały model GGUF, który można automatycznie pobrać. Przykład używa repozytorium `Qwen2.5‑3B‑Instruct‑GGUF`, kwantowanego do `q4_k_m`. Dodatkowo informujemy środowisko wykonawcze, aby przydzieliło 20 warstw na GPU, co równoważy szybkość i zużycie VRAM. 
+ +```python +# Model configuration – auto‑download a small Q4‑K‑M quantised model +model_config = ai.AsposeAIModelConfig() +model_config.allow_auto_download = "true" +model_config.hugging_face_repo_id = "bartowski/Qwen2.5-3B-Instruct-GGUF" +model_config.hugging_face_quantization = "q4_k_m" +model_config.gpu_layers = 20 # Use 20 GPU layers when a GPU is available +``` + +**Za kulisami:** Kwantowany model zajmuje około 1,5 GB na dysku, co jest ułamkiem pełnoprecyzyjnego modelu, a mimo to zachowuje wystarczającą subtelność językową, aby wykrywać typowe błędy OCR. + +--- + +## Krok 4: Inicjalizacja AsposeAI i podłączenie procesora sprawdzania pisowni + +Aspose AI zawiera gotowy procesor sprawdzania pisowni. Po podłączeniu, każdy wynik OCR zostanie automatycznie oczyszczony. + +```python +# Initialise AsposeAI and attach the built‑in spell‑check post‑processor +ocr_ai = ai.AsposeAI(model_config) # Pass the config we just built +ocr_ai.set_post_processor(ocr_ai.postprocessor_spell_check, {}) # Empty dict → default settings +``` + +**Dlaczego używać procesora post‑processingowego?** Silniki OCR często mylą „Invoice” z „Invo1ce” lub „Total” z „T0tal”. Sprawdzanie pisowni uruchamia lekki model językowy na surowym ciągu znaków i koryguje te błędy, bez konieczności tworzenia własnego słownika. + +--- + +## Krok 5: Uruchomienie procesora sprawdzania pisowni na wyniku OCR + +Po podłączeniu wszystkiego, jedno wywołanie zwraca poprawiony tekst. Drukujemy także zarówno oryginalną, jak i oczyszczoną wersję, abyś mógł zobaczyć różnicę. + +```python +# Run the spell‑check post‑processor on the OCR result +corrected_text = ocr_ai.run_postprocessor(raw_text) + +print("Original :", raw_text) +print("Corrected:", corrected_text) +``` + +Typowy wynik dla faktury może wyglądać tak: + +``` +Original : Invo1ce #12345 +Date: 2023/07/15 +Total: $1,250.00 +... +Corrected: Invoice #12345 +Date: 2023/07/15 +Total: $1,250.00 +... 
+``` + +Zauważ, że „Invo1ce” zamieniło się w poprawne słowo „Invoice”. To moc wbudowanego sprawdzania pisowni AI. + +--- + +## Krok 6: Zwolnienie zasobów GPU – bezpieczne zwalnianie zasobów GPU + +Jeśli uruchamiasz to w długotrwałej usłudze (np. w API webowym przetwarzającym dziesiątki faktur na minutę), musisz zwolnić kontekst GPU po każdej partii. W przeciwnym razie pojawią się wycieki pamięci i w końcu błędy „CUDA out of memory”. + +```python +# Release GPU resources – crucial to avoid memory leaks +ocr_ai.free_resources() +``` + +**Pro tip:** Wywołaj `free_resources()` wewnątrz bloku `finally` lub menedżera kontekstu, aby zawsze się wykonał, nawet w przypadku wystąpienia wyjątku. + +--- + +## Pełny działający przykład + +Połączenie wszystkich elementów daje Ci samodzielny skrypt, który możesz wkleić do dowolnego projektu. + +```python +# extract_text_from_image.py +import aocr +import aspose.ai as ai + +def main(): + # Step 1: Initialise OCR engine + ocr_engine = aocr.OcrEngine() + ocr_engine.language = aocr.Language.English + ocr_engine.recognize_mode = aocr.RecognitionMode.Plain + + # Step 2: Load image for OCR + input_image = aocr.Image.load("YOUR_DIRECTORY/sample_invoice.png") + raw_text = ocr_engine.recognize(input_image) + + # Step 3: Configure Aspose AI model + model_cfg = ai.AsposeAIModelConfig() + model_cfg.allow_auto_download = "true" + model_cfg.hugging_face_repo_id = "bartowski/Qwen2.5-3B-Instruct-GGUF" + model_cfg.hugging_face_quantization = "q4_k_m" + model_cfg.gpu_layers = 20 + + # Step 4: Initialise AI and attach spell‑check + ocr_ai = ai.AsposeAI(model_cfg) + ocr_ai.set_post_processor(ocr_ai.postprocessor_spell_check, {}) + + # Step 5: Run spell‑check + corrected_text = ocr_ai.run_postprocessor(raw_text) + + print("Original :", raw_text) + print("Corrected:", corrected_text) + + # Step 6: Release GPU resources + ocr_ai.free_resources() + +if __name__ == "__main__": + main() +``` + +Zapisz plik, dostosuj ścieżkę do swojego obrazu i uruchom 
`python extract_text_from_image.py`. Powinieneś zobaczyć wyczyszczony tekst faktury wypisany w konsoli. + +--- + +## Najczęściej zadawane pytania (FAQ) + +**Q: Czy to działa na maszynach tylko z CPU?** +A: Zdecydowanie tak. Jeśli nie wykryto GPU, Aspose AI przełącza się na wykonanie na CPU, choć będzie wolniejsze. Możesz wymusić CPU, ustawiając `model_cfg.gpu_layers = 0`. + +**Q: Co jeśli moje faktury są w języku innym niż angielski?** +A: Zmień `ocr_engine.language` na odpowiednią wartość wyliczeniową (np. `aocr.Language.Spanish`). Model sprawdzania pisowni jest wielojęzyczny, ale możesz uzyskać lepsze wyniki przy modelu specyficznym dla języka. + +**Q: Czy mogę przetwarzać wiele obrazów w pętli?** +A: Tak. Po prostu przenieś kroki ładowania, rozpoznawania i post‑processingu do pętli `for`. Pamiętaj, aby wywołać `ocr_ai.free_resources()` po pętli lub po każdej partii, jeśli ponownie używasz tej samej instancji AI. + +**Q: Jak duży jest rozmiar pobieranego modelu?** +A: Około 1,5 GB dla wersji kwantowanej `q4_k_m`. Jest buforowany po pierwszym uruchomieniu, więc kolejne wykonania są natychmiastowe. + +--- + +## Podsumowanie + +W tym samouczku pokazaliśmy, jak **wyodrębnić tekst z obrazu** przy użyciu Aspose OCR, skonfigurować mały model AI, zastosować procesor sprawdzania pisowni i bezpiecznie **zwolnić zasoby GPU**. Przepływ pracy obejmuje wszystko, od ładowania obrazu po sprzątanie po sobie, zapewniając niezawodny pipeline dla scenariuszy **rozpoznawania tekstu z faktur**. + +Kolejne kroki? 
Spróbuj zamienić sprawdzanie pisowni na własny model ekstrakcji encji + +{{< /blocks/products/pf/tutorial-page-section >}} +{{< /blocks/products/pf/main-container >}} +{{< /blocks/products/pf/main-wrap-class >}} +{{< blocks/products/products-backtop-button >}} \ No newline at end of file diff --git a/ocr/polish/python/general/how-to-batch-ocr-with-aspose-ocr-full-python-guide/_index.md b/ocr/polish/python/general/how-to-batch-ocr-with-aspose-ocr-full-python-guide/_index.md new file mode 100644 index 000000000..f6215785d --- /dev/null +++ b/ocr/polish/python/general/how-to-batch-ocr-with-aspose-ocr-full-python-guide/_index.md @@ -0,0 +1,217 @@ +--- +category: general +date: 2026-05-03 +description: Jak przetwarzać obrazy metodą OCR w trybie wsadowym przy użyciu Aspose + OCR i AI sprawdzania pisowni. Dowiedz się, jak wyodrębniać tekst z obrazów, stosować + sprawdzanie pisowni, korzystać z darmowych zasobów AI i korygować błędy OCR. +draft: false +keywords: +- how to batch ocr +- extract text from images +- free ai resources +- apply spell check +- correct ocr errors +language: pl +og_description: Jak przetwarzać obrazy OCR wsadowo przy użyciu Aspose OCR i AI sprawdzania + pisowni. Postępuj zgodnie z przewodnikiem krok po kroku, aby wyodrębnić tekst z + obrazów, zastosować sprawdzanie pisowni, korzystać z darmowych zasobów AI i poprawić + błędy OCR. 
+og_title: Jak wykonywać OCR wsadowo przy użyciu Aspose OCR – Kompletny samouczek w + Pythonie +tags: +- OCR +- Python +- AI +- Aspose +title: Jak przeprowadzić wsadowe OCR przy użyciu Aspose OCR – Pełny przewodnik w Pythonie +url: /pl/python/general/how-to-batch-ocr-with-aspose-ocr-full-python-guide/ +--- + +{{< blocks/products/pf/main-wrap-class >}} +{{< blocks/products/pf/main-container >}} +{{< blocks/products/pf/tutorial-page-section >}} + +# Jak wykonywać OCR wsadowo przy użyciu Aspose OCR – Pełny przewodnik w Pythonie + +Zastanawiałeś się kiedyś **jak wykonywać OCR wsadowo** na całym folderze zeskanowanych PDF‑ów lub zdjęć, nie pisząc osobnego skryptu dla każdego pliku? Nie jesteś sam. W wielu rzeczywistych pipeline’ach będziesz musiał **wyodrębniać tekst z obrazów**, usuwać błędy ortograficzne i w końcu zwolnić wszelkie zasoby AI, które przydzieliłeś. Ten tutorial pokazuje dokładnie, jak to zrobić przy użyciu Aspose OCR, lekkiego post‑procesora AI oraz kilku linijek Pythona. + +Przejdziemy przez inicjalizację silnika OCR, podłączenie korektora ortograficznego AI, iterację po katalogu zdjęć oraz sprzątanie modelu po zakończeniu. Po zakończeniu będziesz mieć gotowy do uruchomienia skrypt, który automatycznie **koryguje błędy OCR** i zwalnia **darmowe zasoby AI**, aby Twój GPU był zadowolony. + +## Czego będziesz potrzebować + +- Python 3.9+ (kod używa podpowiedzi typów, ale działa na wcześniejszych wersjach 3.x) +- Pakiet `asposeocr` (`pip install asposeocr`) – zapewnia silnik OCR. +- Dostęp do modelu Hugging Face `bartowski/Qwen2.5-3B-Instruct-GGUF` (pobierany automatycznie). +- GPU z przynajmniej kilkoma GB VRAM (skrypt ustawia `gpu_layers = 30`, możesz zmniejszyć, jeśli potrzebne). + +Brak zewnętrznych usług, brak płatnych API – wszystko działa lokalnie. + +--- + +## Krok 1: Konfiguracja silnika OCR – **Jak wykonywać OCR wsadowo** efektywnie + +Zanim będziemy mogli przetworzyć tysiąc obrazów, potrzebujemy solidnego silnika OCR. 
Aspose OCR pozwala wybrać język i tryb rozpoznawania w jednym wywołaniu. + +```python +# Step 1: Initialize the OCR engine for English plain‑text output +def init_ocr() -> aocr.OcrEngine: + ocr_engine = aocr.OcrEngine() + ocr_engine.language = aocr.Language.English # English language pack + ocr_engine.recognize_mode = aocr.RecognitionMode.Plain # Returns raw string, no layout + return ocr_engine +``` + +**Dlaczego to ważne:** Ustawienie `recognize_mode` na `Plain` utrzymuje wyjście lekkie, co jest idealne, gdy planujesz później uruchomić sprawdzanie pisowni. Jeśli potrzebujesz informacji o układzie, przełączyłbyś się na `Layout`, ale to dodaje narzut, którego prawdopodobnie nie chcesz w zadaniu wsadowym. + +> **Pro tip:** Jeśli masz do czynienia ze skanami wielojęzycznymi, możesz przekazać listę taką jak `ocr_engine.language = [aocr.Language.English, aocr.Language.Spanish]`. + +--- + +## Krok 2: Inicjalizacja post‑procesora AI – **Zastosuj sprawdzanie pisowni** do wyniku OCR + +Aspose AI dostarcza wbudowany post‑procesor, który może uruchomić dowolny model. Tutaj pobieramy kwantyzowany model Qwen 2.5 z Hugging Face i podłączamy procedurę sprawdzania pisowni. + +```python +# Step 2: Configure and start the AI post‑processor +def init_ai() -> aocr.ai.AsposeAI: + model_cfg = AsposeAIModelConfig() + model_cfg.allow_auto_download = "true" + model_cfg.hugging_face_repo_id = "bartowski/Qwen2.5-3B-Instruct-GGUF" + model_cfg.hugging_face_quantization = "q4_k_m" + model_cfg.gpu_layers = 30 # Adjust based on your GPU memory + ai_processor = AsposeAI() + ai_processor.initialize(model_cfg) + + # Attach the built‑in spell‑check post‑processor + ai_processor.set_post_processor(ai_processor.postprocessor_spell_check, {}) + return ai_processor +``` + +**Dlaczego to ważne:** Model jest kwantyzowany (`q4_k_m`), co znacznie zmniejsza zużycie pamięci, jednocześnie zapewniając przyzwoite rozumienie języka. 
Wywołując `set_post_processor`, informujemy Aspose AI, aby automatycznie wykonywał krok **apply spell check** na dowolnym ciągu, który mu przekażemy. + +> **Uwaga:** Jeśli Twój GPU nie radzi sobie z 30 warstwami, zmniejsz liczbę do 15 lub nawet 5 – skrypt nadal będzie działał, tylko nieco wolniej. + +--- + +## Krok 3: Uruchom OCR i **koryguj błędy OCR** na pojedynczym obrazie + +Teraz, gdy silnik OCR i korektor ortograficzny AI są gotowe, łączymy je. Ta funkcja wczytuje obraz, wyodrębnia surowy tekst, a następnie uruchamia post‑procesor AI, aby go oczyścić. + +```python +# Step 3: OCR an image and run the spell‑check post‑processor +def ocr_and_correct(image_path: str, + ocr_engine: aocr.OcrEngine, + ai_processor: aocr.ai.AsposeAI) -> str: + image = aocr.Image.load(image_path) # Load any supported format + raw_text = ocr_engine.recognize(image) # Plain string from OCR + corrected_text = ai_processor.run_postprocessor(raw_text) + return corrected_text +``` + +**Dlaczego to ważne:** Bezpośrednie podanie surowego ciągu OCR do modelu AI daje nam etap **correct OCR errors** bez pisania jakichkolwiek wyrażeń regularnych czy własnych słowników. Model rozumie kontekst, więc może naprawić „recieve” → „receive” i jeszcze subtelniejsze błędy. + +--- + +## Krok 4: **Wyodrębniaj tekst z obrazów** masowo – prawdziwa pętla wsadowa + +Tutaj magia **jak wykonywać OCR wsadowo** błyszczy. Iterujemy po katalogu, pomijamy nieobsługiwane pliki i zapisujemy każde poprawione wyjście do pliku `.txt`. 
+ +```python +# Step 4: Process an entire folder of images +if __name__ == "__main__": + # Initialize once – reuse for every file + ocr_engine = init_ocr() + ai_processor = init_ai() + + input_dir = "YOUR_DIRECTORY/input_images" + output_dir = "YOUR_DIRECTORY/output_text" + os.makedirs(output_dir, exist_ok=True) + + for file_name in os.listdir(input_dir): + # Only handle common image extensions + if not file_name.lower().endswith(('.png', '.jpg', '.jpeg', '.tif', '.tiff')): + continue + + image_path = os.path.join(input_dir, file_name) + corrected = ocr_and_correct(image_path, ocr_engine, ai_processor) + + txt_path = os.path.join(output_dir, + os.path.splitext(file_name)[0] + ".txt") + with open(txt_path, "w", encoding="utf-8") as txt_file: + txt_file.write(corrected) + + print(f"Processed {file_name}") + + # Step 5: Release **free AI resources** after the batch finishes + ai_processor.free_resources() +``` + +### Oczekiwany wynik + +Dla obrazu zawierającego zdanie *„The quick brown fox jumps over the lazzy dog.”* zobaczysz plik tekstowy z: + +``` +The quick brown fox jumps over the lazy dog. +``` + +Zauważ, że podwójne „z” zostało automatycznie skorygowane – to działanie AI spell‑check. + +**Dlaczego to ważne:** Tworząc obiekty OCR i AI **jednorazowo** i ponownie ich używając, unikamy narzutu ładowania modelu dla każdego pliku. To najefektywniejszy sposób na **jak wykonywać OCR wsadowo** w skali. + +--- + +## Krok 5: Sprzątanie – **Zwolnij zasoby AI** prawidłowo + +Gdy skończysz, wywołanie `free_resources()` zwalnia pamięć GPU, konteksty CUDA oraz wszelkie tymczasowe pliki utworzone przez model. + +```python +# Step 5: Explicitly free GPU and model memory +ai_processor.free_resources() +``` + +Pominięcie tego kroku może pozostawić wiszące alokacje GPU, co może spowodować awarię kolejnych procesów Pythona lub zająć VRAM. Traktuj to jako część „wyłączenia świateł” w zadaniu wsadowym. 
+ +--- + +## Typowe problemy i dodatkowe wskazówki + +| Issue | What to Look For | Fix | +|-------|------------------|-----| +| **Błędy braku pamięci** | GPU wyczerpuje się po kilku dziesiątkach obrazów | Zredukuj `gpu_layers` lub przełącz na CPU (`model_cfg.gpu_layers = 0`). | +| **Brak pakietu językowego** | OCR zwraca puste ciągi | Upewnij się, że wersja `asposeocr` zawiera dane języka angielskiego; w razie potrzeby reinstaluj. | +| **Pliki nie będące obrazami** | Skrypt wyłącza się przy przypadkowym pliku `.pdf` | Warunek `if not file_name.lower().endswith(...)` już je pomija. | +| **Sprawdzanie pisowni nie zastosowane** | Wyjście wygląda identycznie jak surowy OCR | Sprawdź, czy `ai_processor.set_post_processor` został wywołany przed pętlą. | +| **Powolna prędkość wsadowa** | Trwa >5 sekund na obraz | Włącz `model_cfg.allow_auto_download = "false"` po pierwszym uruchomieniu, aby model nie był pobierany ponownie przy każdym uruchomieniu. | + +**Pro tip:** Jeśli potrzebujesz **wyodrębniać tekst z obrazów** w języku innym niż angielski, po prostu zmień `ocr_engine.language` na odpowiedni enum (np. `aocr.Language.French`). Ten sam post‑procesor AI nadal zastosuje sprawdzanie pisowni, ale możesz chcieć model specyficzny dla języka, aby uzyskać najlepsze wyniki. + +--- + +## Podsumowanie i kolejne kroki + +Omówiliśmy cały pipeline dla **jak wykonywać OCR wsadowo**: + +1. **Initialize** silnik OCR zwracający czysty tekst dla języka angielskiego. +2. **Configure** model AI do sprawdzania pisowni i powiąż go jako post‑procesor. +3. **Run** OCR na każdym obrazie i pozwól AI **korygować błędy OCR** automatycznie. +4. **Loop** po katalogu, aby **wyodrębniać tekst z obrazów** masowo. +5. **Free AI resources** po zakończeniu zadania. + +Z tego miejsca możesz: + +- Przekierować poprawiony tekst do dalszego pipeline’u NLP (analiza sentymentu, ekstrakcja encji, itp.). 
+- Zamienić post‑procesor sprawdzania pisowni na własny streszczacz, wywołując `ai_processor.set_post_processor(your_custom_func, {})`. +- Zrównoleglić pętlę folderu przy użyciu `concurrent.futures.ThreadPoolExecutor`, jeśli Twój GPU może obsłużyć wiele strumieni. + +--- + +## Końcowe przemyślenia + +Wykonywanie OCR wsadowo nie musi być uciążliwe. Korzystając z Aspose OCR razem z lekkim modelem AI, otrzymujesz **kompleksowe rozwiązanie**, które **wyodrębnia tekst z obrazów**, **stosuje sprawdzanie pisowni**, **koryguje błędy OCR** i **czysto zwalnia zasoby AI**. Wypróbuj skrypt na folderze testowym, dostosuj liczbę warstw GPU do swojego sprzętu i będziesz mieć gotowy do produkcji pipeline w kilka minut. + +Masz pytania dotyczące dostosowywania modelu, obsługi PDF‑ów lub integracji tego w usłudze webowej? zostaw komentarz poniżej lub napisz do mnie na GitHubie. Szczęśliwego kodowania i niech Twój OCR będzie zawsze precyzyjny! + +{{< /blocks/products/pf/tutorial-page-section >}} +{{< /blocks/products/pf/main-container >}} +{{< /blocks/products/pf/main-wrap-class >}} +{{< blocks/products/products-backtop-button >}} \ No newline at end of file diff --git a/ocr/polish/python/general/python-ocr-tutorial-batch-ocr-processing-made-easy/_index.md b/ocr/polish/python/general/python-ocr-tutorial-batch-ocr-processing-made-easy/_index.md new file mode 100644 index 000000000..2880dad69 --- /dev/null +++ b/ocr/polish/python/general/python-ocr-tutorial-batch-ocr-processing-made-easy/_index.md @@ -0,0 +1,300 @@ +--- +category: general +date: 2026-05-03 +description: Samouczek OCR w Pythonie, który pokazuje, jak wczytywać pliki PNG, rozpoznawać + tekst z obrazu oraz darmowe zasoby AI do przetwarzania OCR wsadowego. 
+draft: false +keywords: +- python ocr tutorial +- batch ocr processing +- free ai resources +- load png image +- recognize text from image +language: pl +og_description: Samouczek OCR w Pythonie prowadzi Cię przez ładowanie obrazów PNG, + rozpoznawanie tekstu z obrazu oraz korzystanie z darmowych zasobów AI do przetwarzania + OCR wsadowego. +og_title: Samouczek OCR w Pythonie – Szybkie przetwarzanie wsadowe OCR z darmowymi + zasobami AI +tags: +- OCR +- Python +- AI +title: Samouczek OCR w Pythonie – Łatwe przetwarzanie OCR wsadowego +url: /pl/python/general/python-ocr-tutorial-batch-ocr-processing-made-easy/ +--- + +{{< blocks/products/pf/main-wrap-class >}} +{{< blocks/products/pf/main-container >}} +{{< blocks/products/pf/tutorial-page-section >}} + +# Python OCR Tutorial – Batch OCR Processing Made Easy + +Kiedykolwiek potrzebowałeś **python ocr tutorial**, który naprawdę pozwala uruchomić OCR na dziesiątkach plików PNG bez wyrywania włosów? Nie jesteś sam. W wielu rzeczywistych projektach musisz **load png image** pliki, przekazać je do silnika, a potem posprzątać zasoby AI, gdy skończysz. + +W tym przewodniku przejdziemy krok po kroku przez kompletny, gotowy do uruchomienia przykład, który pokazuje dokładnie, jak **recognize text from image** plików, przetwarzać je wsadowo i zwolnić pamięć AI. Na koniec będziesz mieć samodzielny skrypt, który możesz wrzucić do dowolnego projektu — bez dodatkowego balastu, tylko najważniejsze elementy. + +## What You’ll Need + +- Python 3.10 lub nowszy (użyta tutaj składnia opiera się na f‑strings i type hints) +- Biblioteka OCR, która udostępnia metodę `engine.recognize` – w celach demonstracyjnych przyjmiemy fikcyjny pakiet `aocr`, ale możesz podmienić go na Tesseract, EasyOCR itp. 
+- Moduł pomocniczy `ai` pokazany w fragmencie kodu (obsługuje inicjalizację modelu i czyszczenie zasobów) +- Folder pełen plików PNG, które chcesz przetworzyć + +Jeśli nie masz zainstalowanego `aocr` lub `ai`, możesz je zasymulować przy pomocy stubów – zobacz sekcję „Optional Stubs” pod koniec. + +## Step 1: Initialize the AI Engine (Free AI Resources) + +Zanim przekażesz jakikolwiek obraz do potoku OCR, podkładowy model musi być gotowy. Inicjalizacja tylko raz oszczędza pamięć i przyspiesza zadania wsadowe. + +```python +# step_1_initialize.py +import ai # hypothetical helper that wraps the AI model +import aocr # OCR library + +def init_engine(config_path: str = "config.yaml"): + """ + Initialize the AI engine if it hasn't been set up yet. + This uses free AI resources – the engine will be released later. + """ + if not ai.is_initialized(): + ai.initialize(config_path) # auto‑initialize with the provided configuration + else: + print("Engine already initialized.") +``` + +**Why this matters:** +Wywoływanie `ai.initialize` wielokrotnie dla każdego obrazu przydzielałoby pamięć GPU za każdym razem, co w końcu doprowadziłoby do awarii skryptu. Sprawdzając `ai.is_initialized()` zapewniamy jednorazową alokację – to zasada „free AI resources”. + +## Step 2: Load PNG Image Files for Batch OCR Processing + +Teraz zbieramy wszystkie pliki PNG, które chcemy poddać OCR. Użycie `pathlib` utrzymuje kod niezależny od systemu operacyjnego. + +```python +# step_2_load_images.py +from pathlib import Path +from typing import List + +def collect_png_paths(directory: str) -> List[Path]: + """ + Scan `directory` and return a list of Path objects pointing to PNG files. 
+ """ + base_path = Path(directory) + if not base_path.is_dir(): + raise NotADirectoryError(f"'{directory}' is not a valid folder.") + + png_files = sorted(base_path.glob("*.png")) + if not png_files: + raise FileNotFoundError("No PNG images found in the specified directory.") + + print(f"Found {len(png_files)} PNG image(s) to process.") + return png_files +``` + +**Edge case:** +Jeśli folder zawiera pliki nie‑PNG (np. JPEG), zostaną one zignorowane, co zapobiega „zadławieniu” `aocr.engine.recognize` nieobsługiwanym formatem. + +## Step 3: Run OCR on Each Image and Apply Post‑Processing + +Gdy silnik jest gotowy, a lista plików przygotowana, możemy przejść po obrazach, wyodrębnić surowy tekst i przekazać go do post‑procesora, który usuwa typowe artefakty OCR (np. niechciane podziały linii). + +```python +# step_3_ocr_batch.py +import aocr +import ai +from pathlib import Path +from typing import List + +def ocr_batch(image_paths: List[Path]) -> List[str]: + """ + Perform OCR on each PNG image and return a list of cleaned strings. + """ + results = [] + for image_path in image_paths: + # Load the image – aocr.Image.load abstracts away Pillow/OpenCV details + img = aocr.Image.load(str(image_path)) + + # Recognize raw text (the engine object lives on the aocr module) + raw_text = aocr.engine.recognize(img) + + # Refine the raw OCR output using the AI post‑processor + cleaned_text = ai.run_postprocessor(raw_text) + results.append(cleaned_text) + + print(f"Processed {image_path.name}: {len(cleaned_text)} characters extracted.") + + return results +``` + +**Why we separate loading from recognition:** +`aocr.Image.load` może wykonywać leniwe dekodowanie, co jest szybsze przy dużych partiach. Utrzymanie kroku ładowania jako osobnego etapu ułatwia także podmianę biblioteki obrazu, jeśli później będziesz potrzebował obsługiwać JPEG lub TIFF. + +## Step 4: Clean Up – Free AI Resources After the Batch + +Po zakończeniu przetwarzania wsadowego musimy zwolnić model, aby uniknąć wycieków pamięci, szczególnie na maszynach z GPU.
+ +```python +# step_4_cleanup.py +import ai + +def release_resources(): + """ + Free any allocated AI resources. Safe to call multiple times. + """ + if ai.is_initialized(): + ai.free_resources() + print("AI resources have been released.") + else: + print("No AI resources were allocated.") +``` + +## Putting It All Together – The Complete Script + +Poniżej znajduje się pojedynczy plik, który łączy cztery kroki w spójny przepływ pracy. Zapisz go jako `batch_ocr.py` i uruchom z wiersza poleceń. + +```python +# batch_ocr.py +""" +Python OCR tutorial – end‑to‑end batch OCR processing. +Loads PNG images, runs OCR, post‑processes results, and frees AI resources. +""" + +import sys +from pathlib import Path +import ai +import aocr + +# ---------------------------------------------------------------------- +# Helper functions (copied from the steps above) +# ---------------------------------------------------------------------- +def init_engine(cfg: str = "config.yaml"): + if not ai.is_initialized(): + ai.initialize(cfg) + else: + print("Engine already initialized.") + +def collect_png_paths(directory: str): + base_path = Path(directory) + if not base_path.is_dir(): + raise NotADirectoryError(f"'{directory}' is not a valid folder.") + png_files = sorted(base_path.glob("*.png")) + if not png_files: + raise FileNotFoundError("No PNG images found in the specified directory.") + print(f"Found {len(png_files)} PNG image(s) to process.") + return png_files + +def ocr_batch(image_paths): + results = [] + for image_path in image_paths: + img = aocr.Image.load(str(image_path)) + raw_text = aocr.engine.recognize(img) + cleaned_text = ai.run_postprocessor(raw_text) + results.append(cleaned_text) + print(f"Processed {image_path.name}: {len(cleaned_text)} characters.") + return results + +def release_resources(): + if ai.is_initialized(): + ai.free_resources() + print("AI resources released.") + else: + print("No resources to release.") + +# 
---------------------------------------------------------------------- +# Main execution block +# ---------------------------------------------------------------------- +if __name__ == "__main__": + if len(sys.argv) != 2: + print("Usage: python batch_ocr.py <image_directory>") + sys.exit(1) + + image_dir = sys.argv[1] + + try: + init_engine() + png_paths = collect_png_paths(image_dir) + texts = ocr_batch(png_paths) + + # Optional: write results to a single text file + output_file = Path("ocr_results.txt") + with output_file.open("w", encoding="utf-8") as f: + for path, txt in zip(png_paths, texts): + f.write(f"--- {path.name} ---\n") + f.write(txt + "\n\n") + print(f"All results saved to {output_file.resolve()}") + finally: + release_resources() +``` + +### Expected Output + +Uruchomienie skryptu w folderze zawierającym trzy pliki PNG może wypisać: + +``` +Engine already initialized. +Found 3 PNG image(s) to process. +Processed invoice1.png: 452 characters. +Processed receipt2.png: 317 characters. +Processed flyer3.png: 689 characters. +All results saved to /home/user/ocr_results.txt +AI resources released. +``` + +Plik `ocr_results.txt` będzie zawierał wyraźny separator dla każdego obrazu, po którym nastąpi wyczyszczony tekst OCR. 
+ +## Optional Stubs for aocr & ai (If You Don’t Have Real Packages) + +Jeśli chcesz po prostu przetestować przepływ bez wciągania ciężkich bibliotek OCR, możesz stworzyć minimalne moduły mock: + +```python +# aocr/__init__.py +class Image: + @staticmethod + def load(path): + return f"ImageObject({path})" + +def dummy_recognize(image): + return "Raw OCR output for " + str(image) + +engine = type("Engine", (), {"recognize": dummy_recognize})() +``` + +```python +# ai/__init__.py +_state = {"initialized": False} + +def is_initialized(): + return _state["initialized"] + +def initialize(cfg): + print(f"Initializing AI engine with {cfg}") + _state["initialized"] = True + +def run_postprocessor(text): + # Very naive cleanup: strip extra spaces + return " ".join(text.split()) + +def free_resources(): + print("Freeing AI resources") + _state["initialized"] = False +``` + +Umieść te foldery obok `batch_ocr.py`, a skrypt uruchomi się, wypisując wyniki mock. + +## Pro Tips & Common Pitfalls + +- **Memory spikes:** Jeśli przetwarzasz tysiące wysokiej rozdzielczości PNG, rozważ ich zmniejszenie przed OCR. `aocr.Image.load` często przyjmuje argument `max_size`. +- **Unicode handling:** Zawsze otwieraj plik wyjściowy z `encoding="utf-8"`; silniki OCR mogą emitować znaki spoza ASCII. +- **Parallelism:** Dla OCR obciążającego CPU możesz opakować `ocr_batch` w `concurrent.futures.ThreadPoolExecutor`. Pamiętaj tylko, aby utrzymać jedną instancję `ai` – uruchamianie wielu wątków, które każdy wywołuje `ai.initialize`, podważa cel „free AI resources”. +- **Error resilience:** Owiń pętlę przetwarzającą poszczególne obrazy w blok `try/except`, aby pojedynczy uszkodzony PNG nie przerwał całego wsadu. + +## Conclusion + +Masz teraz **python ocr tutorial**, który pokazuje, jak **load png image** pliki, wykonać **batch OCR processing** i odpowiedzialnie zarządzać **free AI resources**. 
Kompletny, uruchamialny przykład pokazuje dokładnie, jak **recognize text from image** obiektów i posprzątać po sobie, więc możesz go skopiować‑wkleić do własnych projektów bez poszukiwania brakujących elementów. + +Gotowy na kolejny krok? Spróbuj podmienić stubowane moduły `aocr` i `ai` na prawdziwe biblioteki, takie jak `pytesseract` i `torchvision`. Możesz także rozbudować skrypt o wyjście w formacie JSON, wysyłanie wyników do bazy danych lub integrację z chmurowym bucketem. Nie ma granic — miłego kodowania! + +{{< /blocks/products/pf/tutorial-page-section >}} +{{< /blocks/products/pf/main-container >}} +{{< /blocks/products/pf/main-wrap-class >}} +{{< blocks/products/products-backtop-button >}} \ No newline at end of file diff --git a/ocr/polish/python/general/run-ocr-on-image-complete-guide-to-structured-text-extractio/_index.md b/ocr/polish/python/general/run-ocr-on-image-complete-guide-to-structured-text-extractio/_index.md new file mode 100644 index 000000000..0c82c1283 --- /dev/null +++ b/ocr/polish/python/general/run-ocr-on-image-complete-guide-to-structured-text-extractio/_index.md @@ -0,0 +1,254 @@ +--- +category: general +date: 2026-05-03 +description: Dowiedz się, jak uruchomić OCR na obrazie i wyodrębnić tekst z współrzędnymi + przy użyciu strukturalnego rozpoznawania OCR. Dołączony krok po kroku kod w Pythonie. +draft: false +keywords: +- run OCR on image +- extract text with coordinates +- structured OCR recognition +- OCR post‑processing +- bounding box extraction +- image text detection +language: pl +og_description: Uruchom OCR na obrazie i uzyskaj tekst z współrzędnymi, korzystając + ze strukturalnego rozpoznawania OCR. Pełny przykład w Pythonie z wyjaśnieniami. 
+og_title: Uruchom OCR na obrazie – Samouczek wyodrębniania tekstu strukturalnego +tags: +- OCR +- Python +- Computer Vision +title: Uruchom OCR na obrazie – Kompletny przewodnik po ekstrakcji tekstu strukturalnego +url: /pl/python/general/run-ocr-on-image-complete-guide-to-structured-text-extractio/ +--- + +{{< blocks/products/pf/main-wrap-class >}} +{{< blocks/products/pf/main-container >}} +{{< blocks/products/pf/tutorial-page-section >}} + +# Uruchom OCR na obrazie – Kompletny przewodnik po wyodrębnianiu tekstu strukturalnego + +Czy kiedykolwiek potrzebowałeś **uruchomić OCR na obrazie** i nie byłeś pewien, jak zachować dokładne pozycje każdego słowa? Nie jesteś sam. W wielu projektach — skanowanie paragonów, digitalizacja formularzy czy testowanie UI — potrzebujesz nie tylko surowego tekstu, ale także prostokątów ograniczających (bounding boxes), które mówią, gdzie znajduje się każda linia na obrazie. + +Ten tutorial pokazuje praktyczny sposób na *uruchomienie OCR na obrazie* przy użyciu silnika **aocr**, żądanie **structured OCR recognition**, a następnie post‑procesowanie wyniku przy zachowaniu geometrii. Po zakończeniu będziesz w stanie **wyodrębnić tekst z współrzędnymi** w kilku linijkach Pythona i zrozumiesz, dlaczego tryb strukturalny ma znaczenie dla dalszych zadań. + +## What You’ll Learn + +- Jak zainicjalizować silnik OCR dla **structured OCR recognition**. +- Jak podać obraz i otrzymać surowe wyniki zawierające granice linii. +- Jak uruchomić post‑procesor, który czyści tekst bez utraty geometrii. +- Jak iterować po ostatecznych liniach i wypisywać każdy fragment tekstu wraz z jego bounding boxem. + +Bez magii, bez ukrytych kroków — po prostu kompletny, gotowy do uruchomienia przykład, który możesz wkleić do własnego projektu. 
+ +--- + +## Prerequisites + +Zanim przejdziemy dalej, upewnij się, że masz zainstalowane następujące elementy: + +```bash +pip install aocr ai # hypothetical packages; replace with real ones if needed +``` + +Będziesz także potrzebował pliku obrazu (`input_image.png` lub `.jpg`) zawierającego wyraźny, czytelny tekst. Wszystko, od zeskanowanej faktury po zrzut ekranu, będzie działało, o ile silnik OCR będzie w stanie rozpoznać znaki. + +--- + +## Step 1: Initialise the OCR engine for structured recognition + +Pierwszą rzeczą, którą robimy, jest stworzenie instancji `aocr.Engine()` i poinformowanie jej, że chcemy **structured OCR recognition**. Tryb strukturalny zwraca nie tylko czysty tekst, ale także dane geometryczne (prostokąty ograniczające) dla każdej linii, co jest niezbędne, gdy musisz odwzorować tekst z powrotem na obraz. + +```python +import aocr +import ai # hypothetical post‑processing module + +# Initialise the OCR engine +ocr_engine = aocr.Engine() + +# Request structured recognition (text + geometry) +ocr_engine.recognize_mode = aocr.RecognitionMode.Structured +``` + +> **Why this matters:** +> W trybie domyślnym silnik może zwrócić jedynie łańcuch połączonych słów. Tryb strukturalny dostarcza hierarchię stron → linii → słów, z koordynatami, co znacznie ułatwia nakładanie wyników na oryginalny obraz lub przekazywanie ich do modelu uwzględniającego układ. + +--- + +## Step 2: Run OCR on the image and obtain raw results + +Teraz podajemy obraz do silnika. Wywołanie `recognize` zwraca obiekt `OcrResult`, który zawiera kolekcję linii, z których każda ma własny prostokąt ograniczający. + +```python +# Load your image (any format supported by aocr) +input_image_path = "input_image.png" + +# Run OCR – this returns an OcrResult with lines and bounds +raw_result = ocr_engine.recognize(input_image_path) +``` + +W tym momencie `raw_result.lines` zawiera obiekty z dwoma ważnymi atrybutami: + +- `text` – rozpoznany ciąg znaków dla danej linii. 
+- `bounds` – krotka w formacie `(x, y, width, height)` opisująca pozycję linii. + +--- + +## Step 3: Post‑process while preserving geometry + +Surowe wyniki OCR są często zaszumione: niechciane znaki, nieprawidłowe spacje lub problemy z podziałem linii. Funkcja `ai.run_postprocessor` czyści tekst, ale **zachowuje oryginalną geometrię**, więc nadal masz dokładne współrzędne. + +```python +# Apply a post‑processing step that corrects common OCR errors +postprocessed_result = ai.run_postprocessor(raw_result) + +# The structure (lines + bounds) stays the same, only `line.text` changes +``` + +> **Pro tip:** Jeśli posiadasz słowniki specyficzne dla domeny (np. kody produktów), podaj własny słownik do post‑procesora, aby zwiększyć dokładność. + +--- + +## Step 4: Extract text with coordinates – iterate and display + +Na koniec przechodzimy po wyczyszczonych liniach, wypisując każdy bounding box razem z tekstem. To jest sedno **extract text with coordinates**. + +```python +# Print each recognised line together with its bounding box +for line in postprocessed_result.lines: + print(f"[{line.bounds}] {line.text}") +``` + +### Expected Output + +Zakładając, że obraz wejściowy zawiera dwie linie: „Invoice #12345” i „Total: $89.99”, zobaczysz coś w tym stylu: + +``` +[(15, 30, 210, 25)] Invoice #12345 +[(15, 70, 190, 25)] Total: $89.99 +``` + +Pierwsza krotka to `(x, y, width, height)` linii na oryginalnym obrazie, co pozwala rysować prostokąty, podświetlać tekst lub przekazywać współrzędne do innego systemu. + +--- + +## Visualising the Result (Optional) + +Jeśli chcesz zobaczyć bounding boxy nałożone na obraz, możesz użyć Pillow (PIL) do rysowania prostokątów. Poniżej szybki fragment kodu; możesz go pominąć, jeśli potrzebujesz jedynie surowych danych. 
+ +```python +from PIL import Image, ImageDraw + +# Open the original image +img = Image.open(input_image_path) +draw = ImageDraw.Draw(img) + +# Draw a rectangle around each line +for line in postprocessed_result.lines: + x, y, w, h = line.bounds + draw.rectangle([x, y, x + w, y + h], outline="red", width=2) + +# Save or show the annotated image +img.save("annotated_output.png") +img.show() +``` + +![uruchom OCR na obrazie – przykład z bounding boxami](/images/ocr-bounding-boxes.png "uruchom OCR na obrazie – nakładka z bounding boxami") + +Tekst alternatywny powyżej zawiera **główne słowo kluczowe**, spełniając wymóg SEO dla atrybutów alt obrazów. + +--- + +## Why Structured OCR Recognition Beats Simple Text Extraction + +Możesz się zastanawiać: „Czy nie mogę po prostu uruchomić OCR i dostać tekst? Po co geometria?” + +- **Kontekst przestrzenny:** Gdy musisz dopasować pola w formularzu (np. „Data” obok wartości daty), współrzędne mówią, *gdzie* znajdują się dane. +- **Układy wielokolumnowe:** Prosty liniowy tekst traci kolejność; dane strukturalne zachowują kolejność kolumn. +- **Dokładność post‑procesingu:** Znając rozmiar pola, łatwiej określić, czy słowo jest nagłówkiem, przypisem czy przypadkowym artefaktem. + +Krótko mówiąc, **structured OCR recognition** daje elastyczność potrzebną do budowy inteligentniejszych pipeline’ów — niezależnie od tego, czy wprowadzisz dane do bazy, tworzysz przeszukiwalne PDF‑y, czy trenujesz model ML respektujący układ. + +--- + +## Common Edge Cases and How to Handle Them + +| Situation | What to Watch For | Suggested Fix | +|-----------|-------------------|---------------| +| **Rotated or skewed images** | Bounding boxes may be off‑axis. | Pre‑process with deskewing (e.g., OpenCV’s `warpAffine`). | +| **Very small fonts** | Engine may miss characters, leading to empty lines. | Increase image resolution or use `ocr_engine.set_dpi(300)`. | +| **Mixed languages** | Wrong language model can cause garbled text. 
| Set `ocr_engine.language = ["en", "de"]` before recognition. | +| **Overlapping boxes** | Post‑processor might merge two lines unintentionally. | Verify `line.bounds` after processing; adjust thresholds in `ai.run_postprocessor`. | + +Rozwiązanie tych scenariuszy na wczesnym etapie oszczędza późniejsze problemy, zwłaszcza przy skalowaniu rozwiązania do setek dokumentów dziennie. + +--- + +## Full End‑to‑End Script + +Poniżej pełny, gotowy do uruchomienia program, który łączy wszystkie kroki. Skopiuj‑wklej, dostosuj ścieżkę do obrazu i gotowe. + +```python +# -*- coding: utf-8 -*- +""" +Run OCR on image – extract text with coordinates using structured OCR recognition. +Author: Your Name +Date: 2026-05-03 +""" + +import aocr +import ai +from PIL import Image, ImageDraw + +def run_structured_ocr(image_path: str, annotate: bool = False): + # 1️⃣ Initialise the OCR engine + ocr_engine = aocr.Engine() + ocr_engine.recognize_mode = aocr.RecognitionMode.Structured + + # 2️⃣ Recognise the image + raw_result = ocr_engine.recognize(image_path) + + # 3️⃣ Post‑process while keeping geometry + processed = ai.run_postprocessor(raw_result) + + # 4️⃣ Print each line with its bounding box + for line in processed.lines: + print(f"[{line.bounds}] {line.text}") + + # Optional visualisation + if annotate: + img = Image.open(image_path) + draw = ImageDraw.Draw(img) + for line in processed.lines: + x, y, w, h = line.bounds + draw.rectangle([x, y, x + w, y + h], outline="red", width=2) + annotated_path = "annotated_" + image_path + img.save(annotated_path) + print(f"Annotated image saved as {annotated_path}") + +if __name__ == "__main__": + INPUT_IMG = "input_image.png" + run_structured_ocr(INPUT_IMG, annotate=True) +``` + +Uruchomienie tego skryptu spowoduje: + +1. **Run OCR on image** w trybie strukturalnym. +2. **Extract text with coordinates** dla każdej linii. +3. Opcjonalnie wygeneruje oznaczony PNG z zaznaczonymi boxami. 
+ +--- + +## Conclusion + +Masz teraz solidne, samodzielne rozwiązanie do **uruchomienia OCR na obrazie** i **wyodrębniania tekstu z współrzędnymi** przy użyciu **structured OCR recognition**. Kod demonstruje każdy krok — od inicjalizacji silnika, przez post‑processing, po weryfikację wizualną — dzięki czemu możesz go dostosować do paragonów, formularzy czy dowolnych dokumentów wizualnych wymagających precyzyjnej lokalizacji tekstu. + +Co dalej? Spróbuj zamienić silnik `aocr` na inną bibliotekę (Tesseract, EasyOCR) i zobacz, jak różnią się ich wyjścia strukturalne. Eksperymentuj z różnymi strategiami post‑procesingu, takimi jak sprawdzanie pisowni czy własne filtry regex, aby podnieść dokładność w swojej domenie. A jeśli budujesz większy pipeline, rozważ przechowywanie par `(text, bounds)` w bazie danych do późniejszej analizy. + +Miłego kodowania i niech Twoje projekty OCR będą zawsze precyzyjne! + +{{< /blocks/products/pf/tutorial-page-section >}} +{{< /blocks/products/pf/main-container >}} +{{< /blocks/products/pf/main-wrap-class >}} +{{< blocks/products/products-backtop-button >}} \ No newline at end of file diff --git a/ocr/portuguese/python/general/extract-text-from-image-ocr-with-aspose-ai-spell-check/_index.md b/ocr/portuguese/python/general/extract-text-from-image-ocr-with-aspose-ai-spell-check/_index.md new file mode 100644 index 000000000..27876cc48 --- /dev/null +++ b/ocr/portuguese/python/general/extract-text-from-image-ocr-with-aspose-ai-spell-check/_index.md @@ -0,0 +1,230 @@ +--- +category: general +date: 2026-05-03 +description: extrair texto de imagem usando Aspose OCR e correção ortográfica com + IA. Aprenda como fazer OCR em imagem, carregar imagem para OCR, reconhecer texto + de fatura e liberar recursos da GPU. 
+draft: false +keywords: +- extract text from image +- how to ocr image +- load image for ocr +- release gpu resources +- recognize text from invoice +language: pt +og_description: extraia texto de imagem com Aspose OCR e correção ortográfica por + IA. Guia passo a passo cobrindo como fazer OCR em imagem, carregar a imagem para + OCR e liberar recursos da GPU. +og_title: extrair texto de imagem – Guia completo de OCR e verificação ortográfica +tags: +- OCR +- Aspose +- AI +- Python +title: Extrair texto de imagem – OCR com Aspose AI Spell‑Check +url: /pt/python/general/extract-text-from-image-ocr-with-aspose-ai-spell-check/ +--- + +{{< blocks/products/pf/main-wrap-class >}} +{{< blocks/products/pf/main-container >}} +{{< blocks/products/pf/tutorial-page-section >}} + +# extrair texto de imagem – Guia Completo de OCR & Spell‑Check + +Já precisou **extrair texto de imagem** mas não sabia qual biblioteca ofereceria velocidade e precisão? Você não está sozinho. Em muitos projetos reais — pense em processamento de faturas, digitalização de recibos ou escaneamento de contratos — obter texto limpo e pesquisável a partir de uma foto é o primeiro obstáculo. + +A boa notícia é que o Aspose OCR combinado com um modelo leve Aspose AI pode fazer esse trabalho em poucas linhas de Python. Neste tutorial vamos percorrer **como fazer OCR em imagem**, carregar a foto corretamente, executar um pós‑processador de correção ortográfica embutido e, por fim, **liberar recursos da GPU** para que seu aplicativo continue econômico em memória. + +Ao final deste guia você será capaz de **reconhecer texto de faturas** em imagens, corrigir automaticamente erros comuns de OCR e manter sua GPU limpa para o próximo lote. 
+ +--- + +## O que você precisará + +- Python 3.9 ou superior (o código usa type hints, mas funciona em versões 3.x anteriores) +- Pacotes `aspose-ocr` e `aspose-ai` (instale via `pip install aspose-ocr aspose-ai`) +- Uma GPU com suporte a CUDA é opcional; o script recairá para CPU se nenhuma for encontrada. +- Uma imagem de exemplo, por exemplo `sample_invoice.png`, colocada em uma pasta que você possa referenciar. + +Sem frameworks pesados de ML, sem downloads massivos de modelos — apenas um pequeno modelo quantizado Q4‑K‑M que cabe confortavelmente na maioria das GPUs. + +--- + +## Etapa 1: Inicializar o OCR Engine – extrair texto de imagem + +A primeira coisa que você faz é criar uma instância de `OcrEngine` e informar qual idioma você espera. Aqui escolhemos inglês e solicitamos saída em texto simples, ideal para processamento posterior. + +```python +import aocr # Aspose OCR package +import aspose.ai as ai # Aspose AI package + +# Initialise the OCR engine +ocr_engine = aocr.OcrEngine() +ocr_engine.language = aocr.Language.English # Choose any supported language +ocr_engine.recognize_mode = aocr.RecognitionMode.Plain # Plain text makes post‑processing easier +``` + +**Por que isso importa:** Definir o idioma restringe o conjunto de caracteres, melhorando a precisão. O modo de texto simples remove informações de layout que você normalmente não precisa quando só quer extrair texto de imagem. + +--- + +## Etapa 2: Carregar imagem para OCR – como fazer OCR em imagem + +Agora alimentamos o engine com uma foto real. O helper `Image.load` entende formatos comuns (PNG, JPEG, TIFF) e abstrai as peculiaridades de I/O de arquivos. 
+ +```python +# Load the input image – this is the "load image for OCR" step +input_image = aocr.Image.load("YOUR_DIRECTORY/sample_invoice.png") +raw_text = ocr_engine.recognize(input_image) # Returns the recognised text as a string +``` + +**Dica:** Se suas imagens de origem são grandes, considere redimensioná‑las antes de enviá‑las ao engine; dimensões menores podem reduzir o uso de memória da GPU sem prejudicar a qualidade do reconhecimento. + +--- + +## Etapa 3: Configurar o Modelo Aspose AI – reconhecer texto de fatura + +Aspose AI vem com um modelo GGUF pequeno que pode ser baixado automaticamente. O exemplo usa o repositório `Qwen2.5‑3B‑Instruct‑GGUF`, quantizado para `q4_k_m`. Também instruímos o runtime a alocar 20 camadas na GPU, equilibrando velocidade e uso de VRAM. + +```python +# Model configuration – auto‑download a small Q4‑K‑M quantised model +model_config = ai.AsposeAIModelConfig() +model_config.allow_auto_download = "true" +model_config.hugging_face_repo_id = "bartowski/Qwen2.5-3B-Instruct-GGUF" +model_config.hugging_face_quantization = "q4_k_m" +model_config.gpu_layers = 20 # Use 20 GPU layers when a GPU is available +``` + +**Nos bastidores:** O modelo quantizado tem cerca de 1,5 GB no disco, uma fração de um modelo de precisão total, mas ainda captura nuance linguística suficiente para identificar erros típicos de OCR. + +--- + +## Etapa 4: Inicializar AsposeAI e anexar o pós‑processador de correção ortográfica + +Aspose AI inclui um pós‑processador de correção ortográfica pronto para uso. Ao anexá‑lo, cada resultado de OCR será limpo automaticamente. + +```python +# Initialise AsposeAI and attach the built‑in spell‑check post‑processor +ocr_ai = ai.AsposeAI(model_config) # Pass the config we just built +ocr_ai.set_post_processor(ocr_ai.postprocessor_spell_check, {}) # Empty dict → default settings +``` + +**Por que usar o pós‑processador?** Engines de OCR frequentemente leem “Invoice” como “Invo1ce” ou “Total” como “T0tal”. 
A correção ortográfica executa um modelo de linguagem leve sobre a string bruta e corrige esses erros sem que você precise criar um dicionário personalizado. + +--- + +## Etapa 5: Executar o pós‑processador de correção ortográfica no resultado do OCR + +Com tudo conectado, uma única chamada produz o texto corrigido. Também imprimimos as versões original e limpa para que você veja a melhoria. + +```python +# Run the spell‑check post‑processor on the OCR result +corrected_text = ocr_ai.run_postprocessor(raw_text) + +print("Original :", raw_text) +print("Corrected:", corrected_text) +``` + +Saída típica para uma fatura pode ser assim: + +``` +Original : Invo1ce #12345 +Date: 2023/07/15 +Total: $1,250.00 +... +Corrected: Invoice #12345 +Date: 2023/07/15 +Total: $1,250.00 +... +``` + +Observe como “Invo1ce” se transformou na palavra correta “Invoice”. Esse é o poder da correção ortográfica AI embutida. + +--- + +## Etapa 6: Liberar recursos da GPU – liberar recursos da GPU com segurança + +Se você estiver executando isso em um serviço de longa duração (por exemplo, uma API web que processa dezenas de faturas por minuto), deve liberar o contexto da GPU após cada lote. Caso contrário, ocorrerão vazamentos de memória e, eventualmente, erros de “CUDA out of memory”. + +```python +# Release GPU resources – crucial to avoid memory leaks +ocr_ai.free_resources() +``` + +**Dica de especialista:** Chame `free_resources()` dentro de um bloco `finally` ou de um gerenciador de contexto para que ele sempre seja executado, mesmo se ocorrer uma exceção. + +--- + +## Exemplo Completo Funcional + +Juntando todas as peças, você obtém um script autocontido que pode ser inserido em qualquer projeto. 
+ +```python +# extract_text_from_image.py +import aocr +import aspose.ai as ai + +def main(): + # Step 1: Initialise OCR engine + ocr_engine = aocr.OcrEngine() + ocr_engine.language = aocr.Language.English + ocr_engine.recognize_mode = aocr.RecognitionMode.Plain + + # Step 2: Load image for OCR + input_image = aocr.Image.load("YOUR_DIRECTORY/sample_invoice.png") + raw_text = ocr_engine.recognize(input_image) + + # Step 3: Configure Aspose AI model + model_cfg = ai.AsposeAIModelConfig() + model_cfg.allow_auto_download = "true" + model_cfg.hugging_face_repo_id = "bartowski/Qwen2.5-3B-Instruct-GGUF" + model_cfg.hugging_face_quantization = "q4_k_m" + model_cfg.gpu_layers = 20 + + # Step 4: Initialise AI and attach spell‑check + ocr_ai = ai.AsposeAI(model_cfg) + ocr_ai.set_post_processor(ocr_ai.postprocessor_spell_check, {}) + + # Step 5: Run spell‑check + corrected_text = ocr_ai.run_postprocessor(raw_text) + + print("Original :", raw_text) + print("Corrected:", corrected_text) + + # Step 6: Release GPU resources + ocr_ai.free_resources() + +if __name__ == "__main__": + main() +``` + +Salve o arquivo, ajuste o caminho para sua imagem e execute `python extract_text_from_image.py`. Você deverá ver o texto da fatura limpo impresso no console. + +--- + +## Perguntas Frequentes (FAQ) + +**Q: Isso funciona em máquinas apenas com CPU?** +A: Absolutamente. Se nenhuma GPU for detectada, Aspose AI recai para execução em CPU, embora seja mais lento. Você pode forçar a CPU definindo `model_cfg.gpu_layers = 0`. + +**Q: E se minhas faturas estiverem em outro idioma que não o inglês?** +A: Altere `ocr_engine.language` para o valor enum apropriado (por exemplo, `aocr.Language.Spanish`). O modelo de correção ortográfica é multilíngue, mas você pode obter melhores resultados com um modelo específico para o idioma. + +**Q: Posso processar várias imagens em um loop?** +A: Sim. Basta mover as etapas de carregamento, reconhecimento e pós‑processamento para dentro de um `for` loop. 
Lembre‑se de chamar `ocr_ai.free_resources()` após o loop ou após cada lote se estiver reutilizando a mesma instância AI. + +**Q: Qual o tamanho do download do modelo?** +A: Aproximadamente 1,5 GB para a versão quantizada `q4_k_m`. Ele é armazenado em cache após a primeira execução, então execuções subsequentes são instantâneas. + +--- + +## Conclusão + +Neste tutorial demonstramos como **extrair texto de imagem** usando Aspose OCR, configurar um modelo AI pequeno, aplicar um pós‑processador de correção ortográfica e liberar **recursos da GPU** com segurança. O fluxo cobre tudo, desde o carregamento da foto até a limpeza final, oferecendo um pipeline confiável para cenários de **reconhecimento de texto de fatura**. + +Próximos passos? Experimente substituir a correção ortográfica por um modelo personalizado de extração de entidades. + +{{< /blocks/products/pf/tutorial-page-section >}} +{{< /blocks/products/pf/main-container >}} +{{< /blocks/products/pf/main-wrap-class >}} +{{< blocks/products/products-backtop-button >}} \ No newline at end of file diff --git a/ocr/portuguese/python/general/how-to-batch-ocr-with-aspose-ocr-full-python-guide/_index.md b/ocr/portuguese/python/general/how-to-batch-ocr-with-aspose-ocr-full-python-guide/_index.md new file mode 100644 index 000000000..adfcb2284 --- /dev/null +++ b/ocr/portuguese/python/general/how-to-batch-ocr-with-aspose-ocr-full-python-guide/_index.md @@ -0,0 +1,215 @@ +--- +category: general +date: 2026-05-03 +description: Como fazer OCR em lote de imagens usando Aspose OCR e correção ortográfica + com IA. Aprenda a extrair texto de imagens, aplicar correção ortográfica, usar recursos + de IA gratuitos e corrigir erros de OCR. +draft: false +keywords: +- how to batch ocr +- extract text from images +- free ai resources +- apply spell check +- correct ocr errors +language: pt +og_description: Como fazer OCR em lote de imagens usando Aspose OCR e correção ortográfica + com IA. 
Siga um guia passo a passo para extrair texto de imagens, aplicar correção + ortográfica, recursos de IA gratuitos e corrigir erros de OCR. +og_title: Como fazer OCR em lote com Aspose OCR – Tutorial completo em Python +tags: +- OCR +- Python +- AI +- Aspose +title: Como fazer OCR em lote com Aspose OCR – Guia completo em Python +url: /pt/python/general/how-to-batch-ocr-with-aspose-ocr-full-python-guide/ +--- + +{{< blocks/products/pf/main-wrap-class >}} +{{< blocks/products/pf/main-container >}} +{{< blocks/products/pf/tutorial-page-section >}} + +# Como fazer OCR em lote com Aspose OCR – Guia Completo em Python + +Já se perguntou **como fazer OCR em lote** de uma pasta inteira de PDFs escaneados ou fotos sem precisar escrever um script separado para cada arquivo? Você não está sozinho. Em muitos pipelines reais você precisará **extrair texto de imagens**, corrigir erros ortográficos e, por fim, liberar quaisquer recursos de IA que tenha alocado. Este tutorial mostra exatamente como fazer isso com Aspose OCR, um pós‑processador de IA leve, e algumas linhas de Python. + +Vamos percorrer a inicialização do motor OCR, a conexão de um verificador ortográfico de IA, a iteração sobre um diretório de imagens e a limpeza do modelo ao final. Ao final, você terá um script pronto‑para‑executar que **corrige erros de OCR** automaticamente e libera **recursos de IA** para que sua GPU permaneça feliz. + +## O que você vai precisar + +- Python 3.9+ (o código usa type‑hints, mas funciona em versões anteriores 3.x) +- Pacote `asposeocr` (`pip install asposeocr`) – fornece o motor OCR. +- Acesso ao modelo Hugging Face `bartowski/Qwen2.5-3B-Instruct-GGUF` (baixado automaticamente). +- Uma GPU com pelo menos alguns GB de VRAM (o script define `gpu_layers = 30`, você pode reduzir se necessário). + +Sem serviços externos, sem APIs pagas – tudo roda localmente. 
+ +--- + +## Etapa 1: Configurar o Motor OCR – **Como fazer OCR em lote** de forma eficiente + +Antes de processar mil imagens, precisamos de um motor OCR sólido. O Aspose OCR permite escolher idioma e modo de reconhecimento em uma única chamada. + +```python +# Step 1: Initialize the OCR engine for English plain‑text output +def init_ocr() -> aocr.OcrEngine: + ocr_engine = aocr.OcrEngine() + ocr_engine.language = aocr.Language.English # English language pack + ocr_engine.recognize_mode = aocr.RecognitionMode.Plain # Returns raw string, no layout + return ocr_engine +``` + +**Por que isso importa:** Definir `recognize_mode` como `Plain` mantém a saída leve, o que é ideal quando você planeja executar uma verificação ortográfica depois. Se precisar de informações de layout, troque para `Layout`, mas isso adiciona overhead que provavelmente você não quer em um job em lote. + +> **Dica de especialista:** Se estiver lidando com digitalizações multilíngues, pode passar uma lista como `ocr_engine.language = [aocr.Language.English, aocr.Language.Spanish]`. + +--- + +## Etapa 2: Inicializar o Pós‑processador de IA – **Aplicar verificação ortográfica** ao output do OCR + +O Aspose AI vem com um pós‑processador embutido que pode rodar qualquer modelo que você desejar. Aqui puxamos um modelo Qwen 2.5 quantizado do Hugging Face e conectamos a rotina de verificação ortográfica. 
+ +```python +# Step 2: Configure and start the AI post‑processor +def init_ai() -> aocr.ai.AsposeAI: + model_cfg = AsposeAIModelConfig() + model_cfg.allow_auto_download = "true" + model_cfg.hugging_face_repo_id = "bartowski/Qwen2.5-3B-Instruct-GGUF" + model_cfg.hugging_face_quantization = "q4_k_m" + model_cfg.gpu_layers = 30 # Adjust based on your GPU memory + ai_processor = AsposeAI() + ai_processor.initialize(model_cfg) + + # Attach the built‑in spell‑check post‑processor + ai_processor.set_post_processor(ai_processor.postprocessor_spell_check, {}) + return ai_processor +``` + +**Por que isso importa:** O modelo está quantizado (`q4_k_m`), o que reduz drasticamente o uso de memória enquanto ainda oferece compreensão de linguagem decente. Ao chamar `set_post_processor` informamos ao Aspose AI para executar a etapa **apply spell check** automaticamente em qualquer string que enviarmos. + +> **Atenção:** Se sua GPU não conseguir lidar com 30 camadas, diminua o número para 15 ou até 5 – o script ainda funcionará, apenas um pouco mais lento. + +--- + +## Etapa 3: Executar OCR e **Corrigir erros de OCR** em uma única imagem + +Com o motor OCR e o verificador ortográfico de IA prontos, os combinamos. Esta função carrega uma imagem, extrai o texto bruto e, em seguida, executa o pós‑processador de IA para limpá‑lo. + +```python +# Step 3: OCR an image and run the spell‑check post‑processor +def ocr_and_correct(image_path: str, + ocr_engine: aocr.OcrEngine, + ai_processor: aocr.ai.AsposeAI) -> str: + image = aocr.Image.load(image_path) # Load any supported format + raw_text = ocr_engine.recognize(image) # Plain string from OCR + corrected_text = ai_processor.run_postprocessor(raw_text) + return corrected_text +``` + +**Por que isso importa:** Alimentar diretamente a string OCR bruta ao modelo de IA nos dá uma passagem **correct OCR errors** sem precisar escrever regexes ou dicionários personalizados. 
O modelo entende o contexto, podendo corrigir “recieve” → “receive” e erros ainda mais sutis. + +--- + +## Etapa 4: **Extrair texto de imagens** em massa – O loop real de lote + +Aqui é onde a magia de **como fazer OCR em lote** brilha. Iteramos sobre um diretório, ignoramos arquivos não suportados e gravamos cada output corrigido em um arquivo `.txt`. + +```python +# Step 4: Process an entire folder of images +if __name__ == "__main__": + # Initialize once – reuse for every file + ocr_engine = init_ocr() + ai_processor = init_ai() + + input_dir = "YOUR_DIRECTORY/input_images" + output_dir = "YOUR_DIRECTORY/output_text" + os.makedirs(output_dir, exist_ok=True) + + for file_name in os.listdir(input_dir): + # Only handle common image extensions + if not file_name.lower().endswith(('.png', '.jpg', '.jpeg', '.tif', '.tiff')): + continue + + image_path = os.path.join(input_dir, file_name) + corrected = ocr_and_correct(image_path, ocr_engine, ai_processor) + + txt_path = os.path.join(output_dir, + os.path.splitext(file_name)[0] + ".txt") + with open(txt_path, "w", encoding="utf-8") as txt_file: + txt_file.write(corrected) + + print(f"Processed {file_name}") + + # Step 5: Release **free AI resources** after the batch finishes + ai_processor.free_resources() +``` + +### Saída esperada + +Para uma imagem contendo a frase *“The quick brown fox jumps over the lazzy dog.”* você verá um arquivo de texto com: + +``` +The quick brown fox jumps over the lazy dog. +``` + +Observe que o “zz” duplo foi corrigido automaticamente – esse é o verificador ortográfico de IA em ação. + +**Por que isso importa:** Ao criar os objetos OCR e IA **uma única vez** e reutilizá‑los, evitamos o overhead de carregar o modelo para cada arquivo. Essa é a forma mais eficiente de **como fazer OCR em lote** em escala. 
+ +--- + +## Etapa 5: Limpeza – **Liberar recursos de IA** corretamente + +Quando terminar, chamar `free_resources()` libera a memória da GPU, contextos CUDA e quaisquer arquivos temporários que o modelo criou. + +```python +# Step 5: Explicitly free GPU and model memory +ai_processor.free_resources() +``` + +Pular essa etapa pode deixar alocações de GPU pendentes, o que pode travar processos Python subsequentes ou consumir VRAM. Pense nisso como a parte “desligar as luzes” de um job em lote. + +--- + +## Armadilhas comuns & Dicas extras + +| Problema | O que observar | Solução | +|----------|----------------|---------| +| **Erros de falta de memória** | GPU esgota após algumas dezenas de imagens | Reduza `gpu_layers` ou troque para CPU (`model_cfg.gpu_layers = 0`). | +| **Pacote de idioma ausente** | OCR retorna strings vazias | Garanta que a versão do `asposeocr` inclua os dados de idioma inglês; reinstale se necessário. | +| **Arquivos não‑imagem** | Script falha ao encontrar um `.pdf` inesperado | A verificação `if not file_name.lower().endswith(...)` já os ignora. | +| **Verificação ortográfica não aplicada** | Output idêntico ao OCR bruto | Verifique se `ai_processor.set_post_processor` foi chamado antes do loop. | +| **Velocidade lenta** | Demora >5 segundos por imagem | Ative `model_cfg.allow_auto_download = "false"` após a primeira execução, para evitar re‑download do modelo. | + +**Dica de especialista:** Se precisar **extrair texto de imagens** em um idioma diferente do inglês, basta mudar `ocr_engine.language` para o enum apropriado (ex.: `aocr.Language.French`). O mesmo pós‑processador de IA ainda aplicará a verificação ortográfica, mas você pode querer um modelo específico para o idioma para obter melhores resultados. + +--- + +## Recapitulação & Próximos passos + +Cobremos todo o pipeline para **como fazer OCR em lote**: + +1. **Inicializar** um motor OCR de texto simples para inglês. +2. 
**Configurar** um modelo de verificação ortográfica de IA e vinculá‑lo como pós‑processador. +3. **Executar** OCR em cada imagem e deixar a IA **corrigir erros de OCR** automaticamente. +4. **Iterar** sobre um diretório para **extrair texto de imagens** em massa. +5. **Liberar recursos de IA** ao final do job. + +A partir daqui você pode: + +- Encaminhar o texto corrigido para um pipeline NLP downstream (análise de sentimento, extração de entidades, etc.). +- Trocar o pós‑processador de verificação ortográfica por um resumidor customizado chamando `ai_processor.set_post_processor(seu_func_custom, {})`. +- Paralelizar o loop da pasta com `concurrent.futures.ThreadPoolExecutor` se sua GPU suportar múltiplos streams. + +--- + +## Considerações finais + +Fazer OCR em lote não precisa ser um fardo. Ao combinar Aspose OCR com um modelo de IA leve, você obtém uma **solução tudo‑em‑um** que **extrai texto de imagens**, **aplica verificação ortográfica**, **corrige erros de OCR** e **libera recursos de IA** de forma limpa. Experimente o script em uma pasta de teste, ajuste a contagem de camadas da GPU conforme seu hardware e você terá um pipeline pronto para produção em minutos. + +Tem dúvidas sobre ajustar o modelo, lidar com PDFs ou integrar isso a um serviço web? Deixe um comentário abaixo ou me chame no GitHub. Boa codificação, e que seu OCR seja sempre preciso! 
+ +{{< /blocks/products/pf/tutorial-page-section >}} +{{< /blocks/products/pf/main-container >}} +{{< /blocks/products/pf/main-wrap-class >}} +{{< blocks/products/products-backtop-button >}} \ No newline at end of file diff --git a/ocr/portuguese/python/general/python-ocr-tutorial-batch-ocr-processing-made-easy/_index.md b/ocr/portuguese/python/general/python-ocr-tutorial-batch-ocr-processing-made-easy/_index.md new file mode 100644 index 000000000..68eeeacb9 --- /dev/null +++ b/ocr/portuguese/python/general/python-ocr-tutorial-batch-ocr-processing-made-easy/_index.md @@ -0,0 +1,300 @@ +--- +category: general +date: 2026-05-03 +description: Tutorial de OCR em Python que mostra como carregar arquivos de imagem + PNG, reconhecer texto da imagem e recursos de IA gratuitos para processamento em + lote de OCR. +draft: false +keywords: +- python ocr tutorial +- batch ocr processing +- free ai resources +- load png image +- recognize text from image +language: pt +og_description: Tutorial de OCR em Python orienta você a carregar imagens PNG, reconhecer + texto da imagem e lidar com recursos de IA gratuitos para processamento em lote + de OCR. +og_title: Tutorial de OCR em Python – OCR em lote rápido com recursos de IA gratuitos +tags: +- OCR +- Python +- AI +title: Tutorial de OCR em Python – Processamento em Lote de OCR Facilitado +url: /pt/python/general/python-ocr-tutorial-batch-ocr-processing-made-easy/ +--- + +{{< blocks/products/pf/main-wrap-class >}} +{{< blocks/products/pf/main-container >}} +{{< blocks/products/pf/tutorial-page-section >}} + +# Python OCR Tutorial – Processamento em Lote de OCR Facilitado + +Já precisou de um **python ocr tutorial** que realmente permita executar OCR em dezenas de arquivos PNG sem perder a cabeça? Você não está sozinho. Em muitos projetos do mundo real você tem que **load png image** arquivos, alimentá‑los a um motor e, em seguida, limpar os recursos de IA quando terminar. 
+ +Neste guia vamos percorrer um exemplo completo, pronto‑para‑executar, que mostra exatamente como **recognize text from image** arquivos, processá‑los em lote e liberar a memória de IA subjacente. Ao final você terá um script autônomo que pode ser inserido em qualquer projeto — sem enrolação extra, apenas o essencial. + +## O Que Você Precisa + +- Python 3.10 ou superior (a sintaxe usada aqui depende de f‑strings e type hints) +- Uma biblioteca de OCR que exponha um método `engine.recognize` – para demonstração assumiremos um pacote fictício `aocr`, mas você pode substituir por Tesseract, EasyOCR, etc. +- O módulo auxiliar `ai` mostrado no trecho de código (ele cuida da inicialização do modelo e da limpeza de recursos) +- Uma pasta cheia de arquivos PNG que você deseja processar + +Se você não tem `aocr` ou `ai` instalados, pode simulá‑los com stubs – veja a seção “Stubs Opcionais” perto do final. + +## Etapa 1: Inicializar o Motor de IA (Free AI Resources) + +Antes de alimentar qualquer imagem ao pipeline de OCR, o modelo subjacente precisa estar pronto. Inicializar apenas uma vez economiza memória e acelera trabalhos em lote. + +```python +# step_1_initialize.py +import ai # hypothetical helper that wraps the AI model +import aocr # OCR library + +def init_engine(config_path: str = "config.yaml"): + """ + Initialize the AI engine if it hasn't been set up yet. + This uses free AI resources – the engine will be released later. + """ + if not ai.is_initialized(): + ai.initialize(config_path) # auto‑initialize with the provided configuration + else: + print("Engine already initialized.") +``` + +**Por que isso importa:** +Chamar `ai.initialize` repetidamente para cada imagem alocaria memória de GPU várias vezes, eventualmente travando o script. Ao verificar `ai.is_initialized()` garantimos uma única alocação – esse é o princípio de “Free AI Resources”. 
+ +## Etapa 2: Carregar Arquivos PNG para Processamento em Lote de OCR + +Agora reunimos todos os arquivos PNG que queremos passar pelo OCR. Usar `pathlib` mantém o código independente do SO. + +```python +# step_2_load_images.py +from pathlib import Path +from typing import List + +def collect_png_paths(directory: str) -> List[Path]: + """ + Scan `directory` and return a list of Path objects pointing to PNG files. + """ + base_path = Path(directory) + if not base_path.is_dir(): + raise NotADirectoryError(f"'{directory}' is not a valid folder.") + + png_files = sorted(base_path.glob("*.png")) + if not png_files: + raise FileNotFoundError("No PNG images found in the specified directory.") + + print(f"Found {len(png_files)} PNG image(s) to process.") + return png_files +``` + +**Caso extremo:** +Se a pasta contiver arquivos que não sejam PNG (por exemplo, JPEGs) eles serão ignorados, evitando que `engine.recognize` falhe ao encontrar um formato não suportado. + +## Etapa 3: Executar OCR em Cada Imagem e Aplicar Pós‑Processamento + +Com o motor pronto e a lista de arquivos preparada, podemos iterar sobre as imagens, extrair o texto bruto e entregá‑lo a um pós‑processador que limpa artefatos comuns de OCR (como quebras de linha indesejadas). + +```python +# step_3_ocr_batch.py +import aocr +import ai +from pathlib import Path +from typing import List + +def ocr_batch(image_paths: List[Path]) -> List[str]: + """ + Perform OCR on each PNG image and return a list of cleaned strings. 
+ """ + results = [] + for image_path in image_paths: + # Load the image – aocr.Image.load abstracts away Pillow/OpenCV details + img = aocr.Image.load(str(image_path)) + + # Recognize raw text + raw_text = engine.recognize(img) + + # Refine the raw OCR output using the AI post‑processor + cleaned_text = ai.run_postprocessor(raw_text) + results.append(cleaned_text) + + print(f"Processed {image_path.name}: {len(cleaned_text)} characters extracted.") + + return results +``` + +**Por que separamos carregamento de reconhecimento:** +`aocr.Image.load` pode fazer decodificação preguiçosa, o que é mais rápido para grandes lotes. Manter a etapa de carregamento explícita também facilita trocar por outra biblioteca de imagens caso você precise lidar com JPEG ou TIFF no futuro. + +## Etapa 4: Limpeza – Free AI Resources Após o Lote + +Quando o lote terminar, devemos liberar o modelo para evitar vazamentos de memória, especialmente em máquinas com GPU. + +```python +# step_4_cleanup.py +import ai + +def release_resources(): + """ + Free any allocated AI resources. Safe to call multiple times. + """ + if ai.is_initialized(): + ai.free_resources() + print("AI resources have been released.") + else: + print("No AI resources were allocated.") +``` + +## Juntando Tudo – O Script Completo + +A seguir está um único arquivo que une as quatro etapas em um fluxo coeso. Salve como `batch_ocr.py` e execute pelo terminal. + +```python +# batch_ocr.py +""" +Python OCR tutorial – end‑to‑end batch OCR processing. +Loads PNG images, runs OCR, post‑processes results, and frees AI resources. 
+""" + +import sys +from pathlib import Path +import ai +import aocr + +# ---------------------------------------------------------------------- +# Helper functions (copied from the steps above) +# ---------------------------------------------------------------------- +def init_engine(cfg: str = "config.yaml"): + if not ai.is_initialized(): + ai.initialize(cfg) + else: + print("Engine already initialized.") + +def collect_png_paths(directory: str): + base_path = Path(directory) + if not base_path.is_dir(): + raise NotADirectoryError(f"'{directory}' is not a valid folder.") + png_files = sorted(base_path.glob("*.png")) + if not png_files: + raise FileNotFoundError("No PNG images found in the specified directory.") + print(f"Found {len(png_files)} PNG image(s) to process.") + return png_files + +def ocr_batch(image_paths): + results = [] + for image_path in image_paths: + img = aocr.Image.load(str(image_path)) + raw_text = engine.recognize(img) + cleaned_text = ai.run_postprocessor(raw_text) + results.append(cleaned_text) + print(f"Processed {image_path.name}: {len(cleaned_text)} characters.") + return results + +def release_resources(): + if ai.is_initialized(): + ai.free_resources() + print("AI resources released.") + else: + print("No resources to release.") + +# ---------------------------------------------------------------------- +# Main execution block +# ---------------------------------------------------------------------- +if __name__ == "__main__": + if len(sys.argv) != 2: + print("Usage: python batch_ocr.py ") + sys.exit(1) + + image_dir = sys.argv[1] + + try: + init_engine() + png_paths = collect_png_paths(image_dir) + texts = ocr_batch(png_paths) + + # Optional: write results to a single text file + output_file = Path("ocr_results.txt") + with output_file.open("w", encoding="utf-8") as f: + for path, txt in zip(png_paths, texts): + f.write(f"--- {path.name} ---\n") + f.write(txt + "\n\n") + print(f"All results saved to {output_file.resolve()}") + 
finally: + release_resources() +``` + +### Saída Esperada + +Executar o script em uma pasta contendo três PNGs pode imprimir: + +``` +Engine already initialized. +Found 3 PNG image(s) to process. +Processed invoice1.png: 452 characters. +Processed receipt2.png: 317 characters. +Processed flyer3.png: 689 characters. +All results saved to /home/user/ocr_results.txt +AI resources released. +``` + +O arquivo `ocr_results.txt` conterá um delimitador claro para cada imagem seguido do texto OCR limpo. + +## Stubs Opcionais para aocr & ai (Caso Você Não Tenha Pacotes Reais) + +Se você só quer testar o fluxo sem trazer bibliotecas pesadas de OCR, pode criar módulos mock mínimos: + +```python +# aocr/__init__.py +class Image: + @staticmethod + def load(path): + return f"ImageObject({path})" + +def dummy_recognize(image): + return "Raw OCR output for " + str(image) + +engine = type("Engine", (), {"recognize": dummy_recognize})() +``` + +```python +# ai/__init__.py +_state = {"initialized": False} + +def is_initialized(): + return _state["initialized"] + +def initialize(cfg): + print(f"Initializing AI engine with {cfg}") + _state["initialized"] = True + +def run_postprocessor(text): + # Very naive cleanup: strip extra spaces + return " ".join(text.split()) + +def free_resources(): + print("Freeing AI resources") + _state["initialized"] = False +``` + +Coloque essas pastas ao lado de `batch_ocr.py` e o script será executado, imprimindo resultados simulados. + +## Dicas Profissionais & Armadilhas Comuns + +- **Picos de memória:** Se você estiver processando milhares de PNGs de alta resolução, considere redimensioná‑los antes do OCR. `aocr.Image.load` costuma aceitar um argumento `max_size`. +- **Manipulação de Unicode:** Sempre abra o arquivo de saída com `encoding="utf-8"`; motores de OCR podem gerar caracteres não‑ASCII. +- **Paralelismo:** Para OCR limitado por CPU você pode envolver `ocr_batch` em um `concurrent.futures.ThreadPoolExecutor`. 
Apenas lembre‑se de manter uma única instância de `ai` – criar várias threads que cada uma chama `ai.initialize` anula o objetivo de “Free AI Resources”. +- **Resiliência a erros:** Envolva o loop por imagem em um bloco `try/except` para que um PNG corrompido não interrompa todo o lote. + +## Conclusão + +Agora você tem um **python ocr tutorial** que demonstra como **load png image** arquivos, executar **batch OCR processing** e gerenciar responsavelmente **Free AI Resources**. O exemplo completo e executável mostra exatamente como **recognize text from image** objetos e limpar os recursos depois, para que você possa copiar‑colar em seus próprios projetos sem buscar peças faltantes. + +Pronto para o próximo passo? Experimente substituir os módulos stub `aocr` e `ai` por bibliotecas reais como `pytesseract` e `torchvision`. Você também pode estender o script para gerar JSON, enviar resultados a um banco de dados ou integrar com um bucket de armazenamento na nuvem. O céu é o limite — feliz codificação! + +{{< /blocks/products/pf/tutorial-page-section >}} +{{< /blocks/products/pf/main-container >}} +{{< /blocks/products/pf/main-wrap-class >}} +{{< blocks/products/products-backtop-button >}} \ No newline at end of file diff --git a/ocr/portuguese/python/general/run-ocr-on-image-complete-guide-to-structured-text-extractio/_index.md b/ocr/portuguese/python/general/run-ocr-on-image-complete-guide-to-structured-text-extractio/_index.md new file mode 100644 index 000000000..62d3b6517 --- /dev/null +++ b/ocr/portuguese/python/general/run-ocr-on-image-complete-guide-to-structured-text-extractio/_index.md @@ -0,0 +1,254 @@ +--- +category: general +date: 2026-05-03 +description: Aprenda a executar OCR em imagens e extrair texto com coordenadas usando + reconhecimento OCR estruturado. Código Python passo a passo incluído. 
+draft: false +keywords: +- run OCR on image +- extract text with coordinates +- structured OCR recognition +- OCR post‑processing +- bounding box extraction +- image text detection +language: pt +og_description: Execute OCR em imagem e obtenha o texto com coordenadas usando reconhecimento + OCR estruturado. Exemplo completo em Python com explicações. +og_title: Execute OCR em imagem – Tutorial de Extração de Texto Estruturado +tags: +- OCR +- Python +- Computer Vision +title: Execute OCR em imagem – Guia completo para extração de texto estruturado +url: /pt/python/general/run-ocr-on-image-complete-guide-to-structured-text-extractio/ +--- + +{{< blocks/products/pf/main-wrap-class >}} +{{< blocks/products/pf/main-container >}} +{{< blocks/products/pf/tutorial-page-section >}} + +# Executar OCR em imagem – Guia Completo para Extração de Texto Estruturado + +Já precisou **executar OCR em imagem** mas não sabia como manter as posições exatas de cada palavra? Você não está sozinho. Em muitos projetos—digitalização de recibos, digitalização de formulários ou teste de UI—você precisa não apenas do texto bruto, mas também das caixas delimitadoras que indicam onde cada linha está na imagem. + +Este tutorial mostra uma maneira prática de *executar OCR em imagem* usando o motor **aocr**, solicitar **reconhecimento OCR estruturado**, e então pós‑processar o resultado preservando a geometria. Ao final, você será capaz de **extrair texto com coordenadas** em apenas algumas linhas de Python, e entenderá por que o modo estruturado importa para tarefas posteriores. + +## O que você aprenderá + +- Como inicializar o motor OCR para **reconhecimento OCR estruturado**. +- Como fornecer uma imagem e receber resultados brutos que incluem limites de linha. +- Como executar um pós‑processador que limpa o texto sem perder a geometria. +- Como iterar sobre as linhas finais e imprimir cada trecho de texto junto com sua caixa delimitadora. 
+ +Sem mágica, sem passos ocultos—apenas um exemplo completo e executável que você pode inserir no seu próprio projeto. + +--- + +## Pré-requisitos + +Antes de mergulharmos, certifique‑se de que você tem o seguinte instalado: + +```bash +pip install aocr ai # hypothetical packages; replace with real ones if needed +``` + +Você também precisará de um arquivo de imagem (`input_image.png` ou `.jpg`) que contenha texto claro e legível. Qualquer coisa, desde uma fatura escaneada até uma captura de tela, funciona, contanto que o motor OCR consiga ver os caracteres. + +--- + +## Etapa 1: Inicializar o motor OCR para reconhecimento estruturado + +A primeira coisa que fazemos é criar uma instância de `aocr.Engine()` e dizer que queremos **reconhecimento OCR estruturado**. O modo estruturado devolve não apenas o texto simples, mas também dados geométricos (retângulos delimitadores) para cada linha, o que é essencial quando você precisa mapear o texto de volta à imagem. + +```python +import aocr +import ai # hypothetical post‑processing module + +# Initialise the OCR engine +ocr_engine = aocr.Engine() + +# Request structured recognition (text + geometry) +ocr_engine.recognize_mode = aocr.RecognitionMode.Structured +``` + +> **Por que isso importa:** +> No modo padrão o motor pode apenas devolver uma string de palavras concatenadas. O modo estruturado fornece uma hierarquia de páginas → linhas → palavras, cada uma com coordenadas, facilitando muito sobrepor os resultados na imagem original ou alimentá‑los em um modelo consciente de layout. + +--- + +## Etapa 2: Executar OCR na imagem e obter resultados brutos + +Agora alimentamos a imagem ao motor. A chamada `recognize` devolve um objeto `OcrResult` que contém uma coleção de linhas, cada uma com seu próprio retângulo delimitador. 
+ +```python +# Load your image (any format supported by aocr) +input_image_path = "input_image.png" + +# Run OCR – this returns an OcrResult with lines and bounds +raw_result = ocr_engine.recognize(input_image_path) +``` + +Neste ponto `raw_result.lines` contém objetos com dois atributos importantes: + +- `text` – a string reconhecida para essa linha. +- `bounds` – uma tupla como `(x, y, width, height)` descrevendo a posição da linha. + +--- + +## Etapa 3: Pós‑processar preservando a geometria + +A saída bruta do OCR costuma ser ruidosa: caracteres soltos, espaços fora de lugar ou problemas de quebras de linha. A função `ai.run_postprocessor` limpa o texto mas **mantém a geometria original** intacta, de modo que você ainda tem coordenadas precisas. + +```python +# Apply a post‑processing step that corrects common OCR errors +postprocessed_result = ai.run_postprocessor(raw_result) + +# The structure (lines + bounds) stays the same, only `line.text` changes +``` + +> **Dica profissional:** Se você tem vocabulários específicos de domínio (por exemplo, códigos de produto), forneça um dicionário personalizado ao pós‑processador para melhorar a precisão. + +--- + +## Etapa 4: Extrair texto com coordenadas – iterar e exibir + +Finalmente, percorremos as linhas limpas, imprimindo a caixa delimitadora de cada linha ao lado do seu texto. Este é o núcleo de **extrair texto com coordenadas**. + +```python +# Print each recognised line together with its bounding box +for line in postprocessed_result.lines: + print(f"[{line.bounds}] {line.text}") +``` + +### Saída esperada + +Assumindo que a imagem de entrada contém duas linhas: “Invoice #12345” e “Total: $89.99”, você verá algo como: + +``` +[(15, 30, 210, 25)] Invoice #12345 +[(15, 70, 190, 25)] Total: $89.99 +``` + +A primeira tupla é o `(x, y, width, height)` da linha na imagem original, permitindo que você desenhe retângulos, destaque texto ou alimente as coordenadas em outro sistema. 
+
+---
+
+## Visualizando o Resultado (Opcional)
+
+Se quiser ver as caixas delimitadoras sobrepostas na imagem, pode usar Pillow (PIL) para desenhar retângulos. A seguir um trecho rápido; sinta‑se à vontade para pular se precisar apenas dos dados brutos.
+
+```python
+from PIL import Image, ImageDraw
+
+# Open the original image
+img = Image.open(input_image_path)
+draw = ImageDraw.Draw(img)
+
+# Draw a rectangle around each line
+for line in postprocessed_result.lines:
+    x, y, w, h = line.bounds
+    draw.rectangle([x, y, x + w, y + h], outline="red", width=2)
+
+# Save or show the annotated image
+img.save("annotated_output.png")
+img.show()
+```
+
+![exemplo de execução de OCR em imagem mostrando caixas delimitadoras](/images/ocr-bounding-boxes.png "execução de OCR em imagem – sobreposição de caixas delimitadoras")
+
+O texto alternativo acima descreve o conteúdo da imagem, o que ajuda leitores de tela e mecanismos de busca a entenderem o contexto.
+
+---
+
+## Por que o Reconhecimento OCR Estruturado supera a Extração de Texto Simples
+
+Você pode se perguntar: “Não dá para simplesmente executar OCR e obter o texto? Por que se preocupar com a geometria?”
+
+- **Contexto espacial:** Quando você precisa mapear campos em um formulário (por exemplo, “Date” ao lado de um valor de data), as coordenadas indicam *onde* os dados estão.
+- **Layouts de múltiplas colunas:** Texto linear simples perde a ordem; dados estruturados preservam a ordem das colunas.
+- **Precisão no pós‑processamento:** Conhecer o tamanho da caixa ajuda a decidir se uma palavra é um cabeçalho, uma nota de rodapé ou um artefato solto.
+
+Em resumo, **reconhecimento OCR estruturado** oferece a flexibilidade para construir pipelines mais inteligentes—seja alimentando dados em um banco de dados, criando PDFs pesquisáveis ou treinando um modelo de machine‑learning que respeita o layout. 
+ +--- + +## Casos de Borda Comuns e Como Lidar com Eles + +| Situação | O que observar | Correção sugerida | +|-----------|-------------------|---------------| +| **Imagens rotacionadas ou inclinadas** | Caixas delimitadoras podem estar fora do eixo. | Pré‑processar com correção de inclinação (por exemplo, `warpAffine` do OpenCV). | +| **Fontes muito pequenas** | O motor pode perder caracteres, resultando em linhas vazias. | Aumente a resolução da imagem ou use `ocr_engine.set_dpi(300)`. | +| **Línguas misturadas** | Modelo de idioma errado pode gerar texto confuso. | Defina `ocr_engine.language = ["en", "de"]` antes do reconhecimento. | +| **Caixas sobrepostas** | O pós‑processador pode mesclar duas linhas inadvertidamente. | Verifique `line.bounds` após o processamento; ajuste os limites em `ai.run_postprocessor`. | + +Abordar esses cenários cedo evita dores de cabeça depois, especialmente quando você escala a solução para centenas de documentos por dia. + +--- + +## Script Completo de Ponta a Ponta + +Abaixo está o programa completo, pronto‑para‑executar, que une todas as etapas. Copie‑e‑cole, ajuste o caminho da imagem, e pronto. + +```python +# -*- coding: utf-8 -*- +""" +Run OCR on image – extract text with coordinates using structured OCR recognition. 
+Author: Your Name
+Date: 2026-05-03
+"""
+
+import aocr
+import ai
+from PIL import Image, ImageDraw
+
+def run_structured_ocr(image_path: str, annotate: bool = False):
+    # 1️⃣ Initialise the OCR engine
+    ocr_engine = aocr.Engine()
+    ocr_engine.recognize_mode = aocr.RecognitionMode.Structured
+
+    # 2️⃣ Recognise the image
+    raw_result = ocr_engine.recognize(image_path)
+
+    # 3️⃣ Post‑process while keeping geometry
+    processed = ai.run_postprocessor(raw_result)
+
+    # 4️⃣ Print each line with its bounding box
+    for line in processed.lines:
+        print(f"[{line.bounds}] {line.text}")
+
+    # Optional visualisation
+    if annotate:
+        img = Image.open(image_path)
+        draw = ImageDraw.Draw(img)
+        for line in processed.lines:
+            x, y, w, h = line.bounds
+            draw.rectangle([x, y, x + w, y + h], outline="red", width=2)
+        annotated_path = "annotated_" + image_path
+        img.save(annotated_path)
+        print(f"Annotated image saved as {annotated_path}")
+
+if __name__ == "__main__":
+    INPUT_IMG = "input_image.png"
+    run_structured_ocr(INPUT_IMG, annotate=True)
+```
+
+Executar este script irá:
+
+1. **Executar OCR em imagem** com modo estruturado.
+2. **Extrair texto com coordenadas** para cada linha.
+3. Opcionalmente gerar um PNG anotado mostrando as caixas.
+
+---
+
+## Conclusão
+
+Agora você tem uma solução sólida e autônoma para **executar OCR em imagem** e **extrair texto com coordenadas** usando **reconhecimento OCR estruturado**. O código demonstra cada passo—da inicialização do motor ao pós‑processamento e verificação visual—para que você possa adaptá‑lo a recibos, formulários ou qualquer documento visual que precise de localização precisa de texto.
+
+Qual o próximo passo? Experimente trocar o motor `aocr` por outra biblioteca (Tesseract, EasyOCR) e veja como as saídas estruturadas diferem. Experimente estratégias diferentes de pós‑processamento, como correção ortográfica ou filtros regex personalizados, para aumentar a precisão no seu domínio. 
E se estiver construindo um pipeline maior, considere armazenar os pares `(text, bounds)` em um banco de dados para análises futuras. + +Feliz codificação, e que seus projetos de OCR sejam sempre precisos! + +{{< /blocks/products/pf/tutorial-page-section >}} +{{< /blocks/products/pf/main-container >}} +{{< /blocks/products/pf/main-wrap-class >}} +{{< blocks/products/products-backtop-button >}} \ No newline at end of file diff --git a/ocr/russian/python/general/extract-text-from-image-ocr-with-aspose-ai-spell-check/_index.md b/ocr/russian/python/general/extract-text-from-image-ocr-with-aspose-ai-spell-check/_index.md new file mode 100644 index 000000000..ebd9e24fd --- /dev/null +++ b/ocr/russian/python/general/extract-text-from-image-ocr-with-aspose-ai-spell-check/_index.md @@ -0,0 +1,231 @@ +--- +category: general +date: 2026-05-03 +description: Извлекать текст из изображения с помощью Aspose OCR и AI‑проверки орфографии. + Узнайте, как выполнять OCR изображения, загружать изображение для OCR, распознавать + текст из счета и освобождать ресурсы GPU. +draft: false +keywords: +- extract text from image +- how to ocr image +- load image for ocr +- release gpu resources +- recognize text from invoice +language: ru +og_description: Извлекать текст из изображения с помощью Aspose OCR и AI‑проверки + орфографии. Пошаговое руководство, охватывающее, как выполнять OCR изображения, + загружать изображение для OCR и освобождать ресурсы GPU. 
+og_title: Извлечение текста из изображения – Полное руководство по OCR и проверке
+  орфографии
+tags:
+- OCR
+- Aspose
+- AI
+- Python
+title: Извлечение текста из изображения — OCR с Aspose AI Spell‑Check
+url: /ru/python/general/extract-text-from-image-ocr-with-aspose-ai-spell-check/
+---
+
+{{< blocks/products/pf/main-wrap-class >}}
+{{< blocks/products/pf/main-container >}}
+{{< blocks/products/pf/tutorial-page-section >}}
+
+# Извлечение текста из изображения – Полное руководство по OCR и проверке орфографии
+
+Вам когда‑нибудь нужно было **извлечь текст из изображения**, но вы не знали, какая библиотека обеспечит и скорость, и точность? Вы не одиноки. Во многих реальных проектах — будь то обработка счетов, оцифровка чеков или сканирование контрактов — получение чистого, поискового текста с картинки является первой преградой.
+
+Хорошая новость: Aspose OCR в паре с лёгкой моделью Aspose AI справится с этой задачей в несколько строк кода на Python. В этом руководстве мы пройдёмся по **как выполнить OCR изображения**, правильно загрузим картинку, запустим встроенный пост‑процессор проверки орфографии и, наконец, **освободим ресурсы GPU**, чтобы приложение оставалось экономным по памяти.
+
+К концу этого руководства вы сможете **распознавать текст со счетов**, автоматически исправлять типичные ошибки OCR и поддерживать ваш GPU в чистоте для следующей партии.
+
+---
+
+## Что понадобится
+
+- Python 3.9 или новее (код использует type hints, но работает и в более ранних версиях 3.x)
+- пакеты `aspose-ocr` и `aspose-ai` (устанавливаются через `pip install aspose-ocr aspose-ai`)
+- GPU с поддержкой CUDA — опционально; при отсутствии скрипт переключится на CPU.
+- Пример изображения, например `sample_invoice.png`, размещённый в папке, к которой вы можете обратиться.
+
+Никаких тяжёлых ML‑фреймворков, никаких массивных загрузок моделей — только небольшая Q4‑K‑M квантизированная модель, которая удобно помещается на большинстве GPU. 
+ +--- + +## Шаг 1: Инициализация OCR‑движка – extract text from image + +Первое, что нужно сделать, — создать экземпляр `OcrEngine` и указать ожидаемый язык. Здесь мы выбираем английский и запрашиваем вывод в виде простого текста, что идеально подходит для последующей обработки. + +```python +import aocr # Aspose OCR package +import aspose.ai as ai # Aspose AI package + +# Initialise the OCR engine +ocr_engine = aocr.OcrEngine() +ocr_engine.language = aocr.Language.English # Choose any supported language +ocr_engine.recognize_mode = aocr.RecognitionMode.Plain # Plain text makes post‑processing easier +``` + +**Почему это важно:** Указание языка сужает набор символов, повышая точность. Режим простого текста удаляет информацию о макете, которая обычно не нужна, когда нужно просто извлечь текст из изображения. + +--- + +## Шаг 2: Загрузка изображения для OCR – how to OCR image + +Теперь передаём движку реальную картинку. Помощник `Image.load` понимает распространённые форматы (PNG, JPEG, TIFF) и абстрагирует особенности файлового ввода‑вывода. + +```python +# Load the input image – this is the "load image for OCR" step +input_image = aocr.Image.load("YOUR_DIRECTORY/sample_invoice.png") +raw_text = ocr_engine.recognize(input_image) # Returns the recognised text as a string +``` + +**Совет:** Если исходные изображения большие, подумайте о их масштабировании перед передачей в движок; меньшие размеры могут снизить потребление памяти GPU без потери качества распознавания. + +--- + +## Шаг 3: Настройка модели Aspose AI – recognize text from invoice + +Aspose AI поставляется с крошечной моделью GGUF, которую можно автоматически загрузить. В примере используется репозиторий `Qwen2.5‑3B‑Instruct‑GGUF`, квантизированный до `q4_k_m`. Мы также указываем runtime выделить 20 слоёв на GPU, что балансирует скорость и использование VRAM. 
+
+```python
+# Model configuration – auto‑download a small Q4‑K‑M quantised model
+model_config = ai.AsposeAIModelConfig()
+model_config.allow_auto_download = "true"
+model_config.hugging_face_repo_id = "bartowski/Qwen2.5-3B-Instruct-GGUF"
+model_config.hugging_face_quantization = "q4_k_m"
+model_config.gpu_layers = 20 # Use 20 GPU layers when a GPU is available
+```
+
+**Что происходит за кулисами:** Квантизированная модель занимает примерно 1,5 ГБ на диске, что является лишь частью полной модели, но всё равно сохраняет достаточную языковую нюансировку для выявления типичных опечаток OCR.
+
+---
+
+## Шаг 4: Инициализация AsposeAI и подключение пост‑процессора проверки орфографии
+
+Aspose AI включает готовый пост‑процессор проверки орфографии. Если его подключить, каждый результат OCR будет автоматически очищаться.
+
+```python
+# Initialise AsposeAI and attach the built‑in spell‑check post‑processor
+ocr_ai = ai.AsposeAI(model_config) # Pass the config we just built
+ocr_ai.set_post_processor(ocr_ai.postprocessor_spell_check, {}) # Empty dict → default settings
+```
+
+**Зачем нужен пост‑процессор?** OCR‑движки часто читают «Invoice» как «Invo1ce» или «Total» как «T0tal». Проверка орфографии запускает лёгкую языковую модель над полученной строкой и исправляет такие ошибки без необходимости писать собственный словарь.
+
+---
+
+## Шаг 5: Запуск пост‑процессора проверки орфографии над результатом OCR
+
+Когда всё соединено, один вызов выдаёт исправленный текст. Мы также выводим как оригинальную, так и очищенную версии, чтобы вы могли увидеть улучшение.
+
+```python
+# Run the spell‑check post‑processor on the OCR result
+corrected_text = ocr_ai.run_postprocessor(raw_text)
+
+print("Original :", raw_text)
+print("Corrected:", corrected_text)
+```
+
+Типичный вывод для счета может выглядеть так:
+
+```
+Original : Invo1ce #12345
+Date: 2023/07/15
+Total: $1,250.00
+...
+Corrected: Invoice #12345
+Date: 2023/07/15
+Total: $1,250.00
+... 
+``` + +Обратите внимание, как «Invo1ce» превратилось в правильное слово «Invoice». Это сила встроенной AI‑проверки орфографии. + +--- + +## Шаг 6: Освобождение ресурсов GPU – release gpu resources safely + +Если вы запускаете это в длительно работающем сервисе (например, веб‑API, обрабатывающем десятки счетов в минуту), необходимо освобождать контекст GPU после каждой партии. Иначе появятся утечки памяти и в конце концов ошибки «CUDA out of memory». + +```python +# Release GPU resources – crucial to avoid memory leaks +ocr_ai.free_resources() +``` + +**Профессиональный совет:** Вызывайте `free_resources()` внутри блока `finally` или контекстного менеджера, чтобы он всегда исполнялся, даже при возникновении исключения. + +--- + +## Полный рабочий пример + +Собрав все части вместе, вы получаете автономный скрипт, который можно вставить в любой проект. + +```python +# extract_text_from_image.py +import aocr +import aspose.ai as ai + +def main(): + # Step 1: Initialise OCR engine + ocr_engine = aocr.OcrEngine() + ocr_engine.language = aocr.Language.English + ocr_engine.recognize_mode = aocr.RecognitionMode.Plain + + # Step 2: Load image for OCR + input_image = aocr.Image.load("YOUR_DIRECTORY/sample_invoice.png") + raw_text = ocr_engine.recognize(input_image) + + # Step 3: Configure Aspose AI model + model_cfg = ai.AsposeAIModelConfig() + model_cfg.allow_auto_download = "true" + model_cfg.hugging_face_repo_id = "bartowski/Qwen2.5-3B-Instruct-GGUF" + model_cfg.hugging_face_quantization = "q4_k_m" + model_cfg.gpu_layers = 20 + + # Step 4: Initialise AI and attach spell‑check + ocr_ai = ai.AsposeAI(model_cfg) + ocr_ai.set_post_processor(ocr_ai.postprocessor_spell_check, {}) + + # Step 5: Run spell‑check + corrected_text = ocr_ai.run_postprocessor(raw_text) + + print("Original :", raw_text) + print("Corrected:", corrected_text) + + # Step 6: Release GPU resources + ocr_ai.free_resources() + +if __name__ == "__main__": + main() +``` + +Сохраните файл, поправьте путь к 
вашему изображению и запустите `python extract_text_from_image.py`. Вы должны увидеть очищенный текст счета, выведенный в консоль. + +--- + +## Часто задаваемые вопросы (FAQ) + +**Q: Работает ли это на машинах без GPU?** +A: Абсолютно. Если GPU не обнаружен, Aspose AI переключается на выполнение на CPU, хотя будет работать медленнее. Вы можете принудительно задать CPU, установив `model_cfg.gpu_layers = 0`. + +**Q: Что если мои счета на другом языке, не на английском?** +A: Измените `ocr_engine.language` на соответствующее значение enum (например, `aocr.Language.Spanish`). Модель проверки орфографии многоязычная, но результаты могут быть лучше с языково‑специфичной моделью. + +**Q: Можно ли обрабатывать несколько изображений в цикле?** +A: Да. Просто перенесите шаги загрузки, распознавания и пост‑обработки внутрь цикла `for`. Не забудьте вызвать `ocr_ai.free_resources()` после цикла или после каждой партии, если переиспользуете один и тот же экземпляр AI. + +**Q: Какой размер загрузки модели?** +A: Около 1,5 ГБ для квантизированной версии `q4_k_m`. После первого запуска она кэшируется, так что последующие исполнения происходят мгновенно. + +--- + +## Заключение + +В этом руководстве мы продемонстрировали, как **извлечь текст из изображения** с помощью Aspose OCR, настроить крошечную AI‑модель, применить пост‑процессор проверки орфографии и безопасно **освободить ресурсы GPU**. Рабочий процесс охватывает всё — от загрузки картинки до очистки после себя, предоставляя надёжный конвейер для сценариев **распознавания текста со счетов**. + +Следующий шаг? Попробуйте заменить проверку орфографии на пользовательскую модель извлечения сущностей. 
+ +{{< /blocks/products/pf/tutorial-page-section >}} +{{< /blocks/products/pf/main-container >}} +{{< /blocks/products/pf/main-wrap-class >}} +{{< blocks/products/products-backtop-button >}} \ No newline at end of file diff --git a/ocr/russian/python/general/how-to-batch-ocr-with-aspose-ocr-full-python-guide/_index.md b/ocr/russian/python/general/how-to-batch-ocr-with-aspose-ocr-full-python-guide/_index.md new file mode 100644 index 000000000..60cc3ec64 --- /dev/null +++ b/ocr/russian/python/general/how-to-batch-ocr-with-aspose-ocr-full-python-guide/_index.md @@ -0,0 +1,216 @@ +--- +category: general +date: 2026-05-03 +description: Как пакетно выполнять OCR изображений с использованием Aspose OCR и AI‑проверки + орфографии. Узнайте, как извлекать текст из изображений, применять проверку орфографии, + использовать бесплатные AI‑ресурсы и исправлять ошибки OCR. +draft: false +keywords: +- how to batch ocr +- extract text from images +- free ai resources +- apply spell check +- correct ocr errors +language: ru +og_description: Как пакетно выполнять OCR изображений с помощью Aspose OCR и AI‑проверки + орфографии. Следуйте пошаговому руководству, чтобы извлекать текст из изображений, + применять проверку орфографии, использовать бесплатные AI‑ресурсы и исправлять ошибки + OCR. +og_title: Как выполнять пакетное OCR с Aspose OCR — Полный учебник по Python +tags: +- OCR +- Python +- AI +- Aspose +title: Как пакетно выполнять OCR с Aspose OCR – Полное руководство по Python +url: /ru/python/general/how-to-batch-ocr-with-aspose-ocr-full-python-guide/ +--- + +{{< blocks/products/pf/main-wrap-class >}} +{{< blocks/products/pf/main-container >}} +{{< blocks/products/pf/tutorial-page-section >}} + +# Как выполнять пакетный OCR с Aspose OCR – Полное руководство на Python + +Когда‑нибудь задумывались **как выполнять пакетный OCR** целой папки отсканированных PDF‑файлов или фотографий без написания отдельного скрипта для каждого файла? Вы не одиноки. 
Во многих реальных конвейерах вам понадобится **извлекать текст из изображений**, исправлять орфографические ошибки и в конце освобождать любые выделенные AI‑ресурсы. Это руководство покажет, как сделать всё это с помощью Aspose OCR, лёгкого AI‑постпроцессора, и нескольких строк кода на Python. + +Мы пройдём процесс инициализации OCR‑движка, подключения AI‑проверки орфографии, обхода каталога с изображениями и очистки модели после завершения. К концу вы получите готовый к запуску скрипт, который **исправляет ошибки OCR** автоматически и **освобождает AI‑ресурсы**, чтобы ваш GPU оставался довольным. + +## Что понадобится + +- Python 3.9+ (код использует type‑hints, но работает и в более ранних версиях 3.x) +- пакет `asposeocr` (`pip install asposeocr`) – предоставляет OCR‑движок. +- Доступ к модели Hugging Face `bartowski/Qwen2.5-3B-Instruct-GGUF` (скачивается автоматически). +- GPU с минимум несколькими ГБ видеопамяти (скрипт задаёт `gpu_layers = 30`, при необходимости можно уменьшить). + +Никаких внешних сервисов, никаких платных API – всё работает локально. + +--- + +## Шаг 1: Настройка OCR‑движка – **Как выполнять пакетный OCR** эффективно + +Прежде чем обработать тысячу изображений, нам нужен надёжный OCR‑движок. Aspose OCR позволяет выбрать язык и режим распознавания одним вызовом. + +```python +# Step 1: Initialize the OCR engine for English plain‑text output +def init_ocr() -> aocr.OcrEngine: + ocr_engine = aocr.OcrEngine() + ocr_engine.language = aocr.Language.English # English language pack + ocr_engine.recognize_mode = aocr.RecognitionMode.Plain # Returns raw string, no layout + return ocr_engine +``` + +**Почему это важно:** Установка `recognize_mode` в `Plain` делает вывод лёгким, что идеально, если позже планируется проверка орфографии. Если нужны сведения о макете, переключитесь на `Layout`, но это добавит накладные расходы, которые обычно не нужны в пакетной задаче. 
+ +> **Pro tip:** Если вы работаете с многоязычными сканами, можно передать список, например `ocr_engine.language = [aocr.Language.English, aocr.Language.Spanish]`. + +--- + +## Шаг 2: Инициализация AI‑постпроцессора – **Применить проверку орфографии** к результатам OCR + +Aspose AI поставляется со встроенным постпроцессором, способным запускать любую модель. Здесь мы загружаем квантизированную модель Qwen 2.5 из Hugging Face и подключаем процедуру проверки орфографии. + +```python +# Step 2: Configure and start the AI post‑processor +def init_ai() -> aocr.ai.AsposeAI: + model_cfg = AsposeAIModelConfig() + model_cfg.allow_auto_download = "true" + model_cfg.hugging_face_repo_id = "bartowski/Qwen2.5-3B-Instruct-GGUF" + model_cfg.hugging_face_quantization = "q4_k_m" + model_cfg.gpu_layers = 30 # Adjust based on your GPU memory + ai_processor = AsposeAI() + ai_processor.initialize(model_cfg) + + # Attach the built‑in spell‑check post‑processor + ai_processor.set_post_processor(ai_processor.postprocessor_spell_check, {}) + return ai_processor +``` + +**Почему это важно:** Модель квантизирована (`q4_k_m`), что существенно экономит память, при этом сохраняет достаточное понимание языка. Вызвав `set_post_processor`, мы говорим Aspose AI автоматически выполнять шаг **применить проверку орфографии** для любой переданной строки. + +> **Watch out:** Если ваш GPU не справляется с 30‑ю слоями, уменьшите число до 15 или даже 5 – скрипт всё равно будет работать, просто медленнее. + +--- + +## Шаг 3: Выполнение OCR и **исправление ошибок OCR** на одном изображении + +Теперь, когда OCR‑движок и AI‑проверка орфографии готовы, мы объединяем их. Эта функция загружает изображение, извлекает сырой текст, а затем запускает AI‑постпроцессор для очистки. 
+ +```python +# Step 3: OCR an image and run the spell‑check post‑processor +def ocr_and_correct(image_path: str, + ocr_engine: aocr.OcrEngine, + ai_processor: aocr.ai.AsposeAI) -> str: + image = aocr.Image.load(image_path) # Load any supported format + raw_text = ocr_engine.recognize(image) # Plain string from OCR + corrected_text = ai_processor.run_postprocessor(raw_text) + return corrected_text +``` + +**Почему это важно:** Передача сырой строки OCR напрямую в AI‑модель даёт проход **исправление ошибок OCR** без написания регексов или кастомных словарей. Модель учитывает контекст, поэтому может исправить «recieve» → «receive» и более тонкие ошибки. + +--- + +## Шаг 4: **Извлечение текста из изображений** пакетно – реальный цикл пакетной обработки + +Здесь проявляется магия **как выполнять пакетный OCR**. Мы проходим по каталогу, пропускаем неподдерживаемые файлы и сохраняем каждое исправленное содержимое в файл `.txt`. + +```python +# Step 4: Process an entire folder of images +if __name__ == "__main__": + # Initialize once – reuse for every file + ocr_engine = init_ocr() + ai_processor = init_ai() + + input_dir = "YOUR_DIRECTORY/input_images" + output_dir = "YOUR_DIRECTORY/output_text" + os.makedirs(output_dir, exist_ok=True) + + for file_name in os.listdir(input_dir): + # Only handle common image extensions + if not file_name.lower().endswith(('.png', '.jpg', '.jpeg', '.tif', '.tiff')): + continue + + image_path = os.path.join(input_dir, file_name) + corrected = ocr_and_correct(image_path, ocr_engine, ai_processor) + + txt_path = os.path.join(output_dir, + os.path.splitext(file_name)[0] + ".txt") + with open(txt_path, "w", encoding="utf-8") as txt_file: + txt_file.write(corrected) + + print(f"Processed {file_name}") + + # Step 5: Release **free AI resources** after the batch finishes + ai_processor.free_resources() +``` + +### Ожидаемый вывод + +Для изображения, содержащего предложение *«The quick brown fox jumps over the lazzy dog.»* вы получите текстовый 
файл со следующим содержимым: + +``` +The quick brown fox jumps over the lazy dog. +``` + +Обратите внимание, двойная «z» исправилась автоматически – это действие AI‑проверки орфографии. + +**Почему это важно:** Создавая объекты OCR и AI **один раз** и переиспользуя их, мы избегаем накладных расходов на загрузку модели для каждого файла. Это самый эффективный способ **как выполнять пакетный OCR** в масштабе. + +--- + +## Шаг 5: Очистка – **освобождение AI‑ресурсов** корректно + +Когда работа завершена, вызов `free_resources()` освобождает видеопамять GPU, контексты CUDA и любые временные файлы, созданные моделью. + +```python +# Step 5: Explicitly free GPU and model memory +ai_processor.free_resources() +``` + +Пропуск этого шага может оставить «висящие» выделения GPU, что может привести к сбоям последующих процессов Python или к исчерпанию видеопамяти. Считайте это «выключением света» в пакетной задаче. + +--- + +## Распространённые проблемы и дополнительные советы + +| Проблема | На что обратить внимание | Решение | +|----------|--------------------------|---------| +| **Ошибки нехватки памяти** | GPU заканчивается после нескольких десятков изображений | Уменьшите `gpu_layers` или переключитесь на CPU (`model_cfg.gpu_layers = 0`). | +| **Отсутствует языковой пакет** | OCR возвращает пустые строки | Убедитесь, что версия `asposeocr` включает данные английского языка; при необходимости переустановите. | +| **Неизображения файлы** | Скрипт падает при случайном `.pdf` | Условие `if not file_name.lower().endswith(...)` уже пропускает их. | +| **Проверка орфографии не применена** | Вывод выглядит идентичным сырым OCR | Убедитесь, что `ai_processor.set_post_processor` был вызван до цикла. | +| **Низкая скорость пакетной обработки** | Занимает более 5 секунд на изображение | Включите `model_cfg.allow_auto_download = "false"` после первого запуска, чтобы модель не скачивалась каждый раз. 
| + +**Pro tip:** Если нужно **извлекать текст из изображений** на языке, отличном от английского, просто измените `ocr_engine.language` на соответствующий enum (например, `aocr.Language.French`). Тот же AI‑постпроцессор всё равно будет выполнять проверку орфографии, но для наилучших результатов может потребоваться модель, специфичная для выбранного языка. + +--- + +## Итоги и дальнейшие шаги + +Мы рассмотрели весь конвейер для **как выполнять пакетный OCR**: + +1. **Инициализировать** OCR‑движок для простого текста на английском. +2. **Настроить** модель AI‑проверки орфографии и привязать её как постпроцессор. +3. **Запустить** OCR на каждом изображении и позволить AI **исправлять ошибки OCR** автоматически. +4. **Обойти** каталог, чтобы **извлекать текст из изображений** пакетно. +5. **Освободить AI‑ресурсы** после завершения работы. + +Отсюда вы можете: + +- Передать исправленный текст в downstream‑конвейер NLP (анализ тональности, извлечение сущностей и т.д.). +- Заменить постпроцессор проверки орфографии на кастомный суммаризатор, вызвав `ai_processor.set_post_processor(your_custom_func, {})`. +- Параллелизовать цикл по папке с помощью `concurrent.futures.ThreadPoolExecutor`, если ваш GPU справится с несколькими потоками. + +--- + +## Заключительные мысли + +Пакетный OCR не обязан быть тяжёлой задачей. Используя Aspose OCR совместно с лёгкой AI‑моделью, вы получаете **универсальное решение**, которое **извлекает текст из изображений**, **применяет проверку орфографии**, **исправляет ошибки OCR** и **чисто освобождает AI‑ресурсы**. Попробуйте скрипт на тестовой папке, подкорректируйте количество GPU‑слоёв под ваше оборудование, и у вас будет готовый к продакшну конвейер за считанные минуты. + +Есть вопросы по настройке модели, работе с PDF‑файлами или интеграции в веб‑сервис? Оставляйте комментарий ниже или пишите мне на GitHub. Приятного кодинга, и пусть ваш OCR будет всегда точным! 
+ +{{< /blocks/products/pf/tutorial-page-section >}} +{{< /blocks/products/pf/main-container >}} +{{< /blocks/products/pf/main-wrap-class >}} +{{< blocks/products/products-backtop-button >}} \ No newline at end of file diff --git a/ocr/russian/python/general/python-ocr-tutorial-batch-ocr-processing-made-easy/_index.md b/ocr/russian/python/general/python-ocr-tutorial-batch-ocr-processing-made-easy/_index.md new file mode 100644 index 000000000..3f3a3cfc1 --- /dev/null +++ b/ocr/russian/python/general/python-ocr-tutorial-batch-ocr-processing-made-easy/_index.md @@ -0,0 +1,300 @@ +--- +category: general +date: 2026-05-03 +description: Учебник по OCR на Python, показывающий, как загружать PNG‑изображения, + распознавать текст на изображении и использовать бесплатные AI‑ресурсы для пакетной + обработки OCR. +draft: false +keywords: +- python ocr tutorial +- batch ocr processing +- free ai resources +- load png image +- recognize text from image +language: ru +og_description: Учебник по OCR на Python проведет вас через загрузку PNG‑изображений, + распознавание текста на изображении и работу с бесплатными AI‑ресурсами для пакетной + обработки OCR. +og_title: Учебник по OCR на Python – Быстрый пакетный OCR с бесплатными AI‑ресурсами +tags: +- OCR +- Python +- AI +title: Учебник по OCR на Python — Пакетная обработка OCR стала простой +url: /ru/python/general/python-ocr-tutorial-batch-ocr-processing-made-easy/ +--- + +{{< blocks/products/pf/main-wrap-class >}} +{{< blocks/products/pf/main-container >}} +{{< blocks/products/pf/tutorial-page-section >}} + +# Python OCR Tutorial – Batch OCR Processing Made Easy + +Когда‑то вам нужен **python ocr tutorial**, который действительно позволяет выполнять OCR для десятков PNG‑файлов, не теряя волосы? Вы не одиноки. В реальных проектах часто требуется **load png image** файлы, передать их движку и потом освободить AI‑ресурсы, когда работа завершена. 
+ +В этом руководстве мы пройдем через полностью готовый к запуску пример, показывающий, как **recognize text from image** файлов, обрабатывать их пакетно и освобождать память AI. К концу вы получите автономный скрипт, который можно вставить в любой проект — без лишних деталей, только самое необходимое. + +## What You’ll Need + +- Python 3.10 или новее (используемый синтаксис опирается на f‑strings и type hints) +- OCR‑библиотека, предоставляющая метод `engine.recognize` — для демонстрации будем считать, что существует вымышленный пакет `aocr`, но вы можете заменить его на Tesseract, EasyOCR и т.п. +- Модуль‑помощник `ai`, показанный в кодовом фрагменте (он отвечает за инициализацию модели и очистку ресурсов) +- Папка, заполненная PNG‑файлами, которые нужно обработать + +Если у вас нет `aocr` или `ai`, их можно имитировать с помощью заглушек — см. раздел «Optional Stubs» в конце. + +## Step 1: Initialize the AI Engine (Free AI Resources) + +Прежде чем подавать изображение в OCR‑конвейер, базовая модель должна быть готова. Инициализация один раз экономит память и ускоряет пакетные задания. + +```python +# step_1_initialize.py +import ai # hypothetical helper that wraps the AI model +import aocr # OCR library + +def init_engine(config_path: str = "config.yaml"): + """ + Initialize the AI engine if it hasn't been set up yet. + This uses free AI resources – the engine will be released later. + """ + if not ai.is_initialized(): + ai.initialize(config_path) # auto‑initialize with the provided configuration + else: + print("Engine already initialized.") +``` + +**Why this matters:** +Вызов `ai.initialize` для каждого изображения заново будет выделять GPU‑память каждый раз, в итоге скрипт может упасть. Проверка `ai.is_initialized()` гарантирует единственное выделение — это и есть принцип «free AI resources». + +## Step 2: Load PNG Image Files for Batch OCR Processing + +Теперь собираем все PNG‑файлы, которые хотим пропустить через OCR. 
Использование `pathlib` делает код независимым от ОС. + +```python +# step_2_load_images.py +from pathlib import Path +from typing import List + +def collect_png_paths(directory: str) -> List[Path]: + """ + Scan `directory` and return a list of Path objects pointing to PNG files. + """ + base_path = Path(directory) + if not base_path.is_dir(): + raise NotADirectoryError(f"'{directory}' is not a valid folder.") + + png_files = sorted(base_path.glob("*.png")) + if not png_files: + raise FileNotFoundError("No PNG images found in the specified directory.") + + print(f"Found {len(png_files)} PNG image(s) to process.") + return png_files +``` + +**Edge case:** +Если в папке есть файлы не‑PNG (например, JPEG), они будут проигнорированы, и `aocr.engine.recognize` не «запнётся» из‑за неподдерживаемого формата. + +## Step 3: Run OCR on Each Image and Apply Post‑Processing + +С готовым движком и списком файлов можно перебрать изображения, извлечь сырой текст и передать его пост‑процессору, который убирает типичные артефакты OCR (например, лишние разрывы строк). + +```python +# step_3_ocr_batch.py +import aocr +import ai +from pathlib import Path +from typing import List + +def ocr_batch(image_paths: List[Path]) -> List[str]: + """ + Perform OCR on each PNG image and return a list of cleaned strings. + """ + results = [] + for image_path in image_paths: + # Load the image – aocr.Image.load abstracts away Pillow/OpenCV details + img = aocr.Image.load(str(image_path)) + + # Recognize raw text (the engine instance lives in the aocr module) + raw_text = aocr.engine.recognize(img) + + # Refine the raw OCR output using the AI post‑processor + cleaned_text = ai.run_postprocessor(raw_text) + results.append(cleaned_text) + + print(f"Processed {image_path.name}: {len(cleaned_text)} characters extracted.") + + return results +``` + +**Why we separate loading from recognition:** +`aocr.Image.load` может выполнять ленивое декодирование, что быстрее при больших партиях.
Явный шаг загрузки также упрощает замену библиотеки изображений, если позже понадобится поддержка JPEG или TIFF. + +## Step 4: Clean Up – Free AI Resources After the Batch + +После завершения пакетной обработки необходимо освободить модель, чтобы избежать утечек памяти, особенно на машинах с GPU. + +```python +# step_4_cleanup.py +import ai + +def release_resources(): + """ + Free any allocated AI resources. Safe to call multiple times. + """ + if ai.is_initialized(): + ai.free_resources() + print("AI resources have been released.") + else: + print("No AI resources were allocated.") +``` + +## Putting It All Together – The Complete Script + +Ниже один файл, который соединяет четыре шага в единый рабочий процесс. Сохраните его как `batch_ocr.py` и запустите из командной строки. + +```python +# batch_ocr.py +""" +Python OCR tutorial – end‑to‑end batch OCR processing. +Loads PNG images, runs OCR, post‑processes results, and frees AI resources. +""" + +import sys +from pathlib import Path +import ai +import aocr + +# ---------------------------------------------------------------------- +# Helper functions (copied from the steps above) +# ---------------------------------------------------------------------- +def init_engine(cfg: str = "config.yaml"): + if not ai.is_initialized(): + ai.initialize(cfg) + else: + print("Engine already initialized.") + +def collect_png_paths(directory: str): + base_path = Path(directory) + if not base_path.is_dir(): + raise NotADirectoryError(f"'{directory}' is not a valid folder.") + png_files = sorted(base_path.glob("*.png")) + if not png_files: + raise FileNotFoundError("No PNG images found in the specified directory.") + print(f"Found {len(png_files)} PNG image(s) to process.") + return png_files + +def ocr_batch(image_paths): + results = [] + for image_path in image_paths: + img = aocr.Image.load(str(image_path)) + raw_text = aocr.engine.recognize(img) + cleaned_text = ai.run_postprocessor(raw_text) + results.append(cleaned_text) + 
print(f"Processed {image_path.name}: {len(cleaned_text)} characters.") + return results + +def release_resources(): + if ai.is_initialized(): + ai.free_resources() + print("AI resources released.") + else: + print("No resources to release.") + +# ---------------------------------------------------------------------- +# Main execution block +# ---------------------------------------------------------------------- +if __name__ == "__main__": + if len(sys.argv) != 2: + print("Usage: python batch_ocr.py ") + sys.exit(1) + + image_dir = sys.argv[1] + + try: + init_engine() + png_paths = collect_png_paths(image_dir) + texts = ocr_batch(png_paths) + + # Optional: write results to a single text file + output_file = Path("ocr_results.txt") + with output_file.open("w", encoding="utf-8") as f: + for path, txt in zip(png_paths, texts): + f.write(f"--- {path.name} ---\n") + f.write(txt + "\n\n") + print(f"All results saved to {output_file.resolve()}") + finally: + release_resources() +``` + +### Expected Output + +Запуск скрипта в папке с тремя PNG‑файлами может вывести: + +``` +Engine already initialized. +Found 3 PNG image(s) to process. +Processed invoice1.png: 452 characters. +Processed receipt2.png: 317 characters. +Processed flyer3.png: 689 characters. +All results saved to /home/user/ocr_results.txt +AI resources released. +``` + +Файл `ocr_results.txt` будет содержать чёткий разделитель для каждого изображения, за которым следует очищенный OCR‑текст. 
+ +## Optional Stubs for aocr & ai (If You Don’t Have Real Packages) + +Если хотите проверить поток без тяжёлых OCR‑библиотек, можно создать минимальные mock‑модули: + +```python +# aocr/__init__.py +class Image: + @staticmethod + def load(path): + return f"ImageObject({path})" + +def dummy_recognize(image): + return "Raw OCR output for " + str(image) + +# staticmethod keeps dummy_recognize unbound, so engine.recognize(img) passes only `img` +engine = type("Engine", (), {"recognize": staticmethod(dummy_recognize)})() +``` + +```python +# ai/__init__.py +_state = {"initialized": False} + +def is_initialized(): + return _state["initialized"] + +def initialize(cfg): + print(f"Initializing AI engine with {cfg}") + _state["initialized"] = True + +def run_postprocessor(text): + # Very naive cleanup: strip extra spaces + return " ".join(text.split()) + +def free_resources(): + print("Freeing AI resources") + _state["initialized"] = False +``` + +Поместите эти папки рядом с `batch_ocr.py`, и скрипт выполнится, выводя имитационные результаты. + +## Pro Tips & Common Pitfalls + +- **Memory spikes:** При обработке тысяч высокоразрешённых PNG‑файлов подумайте об их масштабировании перед OCR. `aocr.Image.load` часто принимает аргумент `max_size`. +- **Unicode handling:** Всегда открывайте файл вывода с `encoding="utf-8"`; OCR‑движки могут генерировать символы вне ASCII. +- **Parallelism:** Для CPU‑ограниченного OCR можно обернуть `ocr_batch` в `concurrent.futures.ThreadPoolExecutor`. Главное — сохранять один экземпляр `ai`; создание множества потоков, каждый из которых вызывает `ai.initialize`, разрушит цель «free AI resources». +- **Error resilience:** Оберните цикл по изображениям в `try/except`, чтобы один повреждённый PNG не прерывал всю пакетную обработку. + +## Conclusion + +Теперь у вас есть **python ocr tutorial**, демонстрирующий, как **load png image** файлы, выполнять **batch OCR processing** и ответственно управлять **free AI resources**.
Полный, готовый к запуску пример показывает, как **recognize text from image** объектов и как правильно освобождать ресурсы, так что вы можете скопировать‑вставить его в свои проекты без поиска недостающих частей. + +Готовы к следующему шагу? Попробуйте заменить заглушки `aocr` и `ai` реальными библиотеками, например `pytesseract` и `torchvision`. Вы также можете расширить скрипт, чтобы выводить JSON, отправлять результаты в базу данных или интегрировать с облачным хранилищем. Возможности безграничны — приятного кодинга! + +{{< /blocks/products/pf/tutorial-page-section >}} +{{< /blocks/products/pf/main-container >}} +{{< /blocks/products/pf/main-wrap-class >}} +{{< blocks/products/products-backtop-button >}} \ No newline at end of file diff --git a/ocr/russian/python/general/run-ocr-on-image-complete-guide-to-structured-text-extractio/_index.md b/ocr/russian/python/general/run-ocr-on-image-complete-guide-to-structured-text-extractio/_index.md new file mode 100644 index 000000000..c69b1d6b7 --- /dev/null +++ b/ocr/russian/python/general/run-ocr-on-image-complete-guide-to-structured-text-extractio/_index.md @@ -0,0 +1,256 @@ +--- +category: general +date: 2026-05-03 +description: Узнайте, как выполнять OCR на изображении и извлекать текст с координатами, + используя структурированное распознавание OCR. Включён пошаговый код на Python. +draft: false +keywords: +- run OCR on image +- extract text with coordinates +- structured OCR recognition +- OCR post‑processing +- bounding box extraction +- image text detection +language: ru +og_description: Запустите OCR на изображении и получите текст с координатами, используя + структурированное распознавание OCR. Полный пример на Python с объяснениями. 
+og_title: Запустите OCR на изображении – Руководство по извлечению структурированного + текста +tags: +- OCR +- Python +- Computer Vision +title: Запуск OCR на изображении – Полное руководство по извлечению структурированного + текста +url: /ru/python/general/run-ocr-on-image-complete-guide-to-structured-text-extractio/ +--- + +{{< blocks/products/pf/main-wrap-class >}} +{{< blocks/products/pf/main-container >}} +{{< blocks/products/pf/tutorial-page-section >}} + +# Запуск OCR на изображении – Полное руководство по извлечению структурированного текста + +Когда‑нибудь вам нужно было **запустить OCR на изображении** файлов, но вы не знали, как сохранить точные позиции каждого слова? Вы не одиноки. Во многих проектах — сканирование чеков, оцифровка форм или тестирование UI — вам нужен не только сырой текст, но и ограничительные рамки, которые показывают, где находится каждая строка на картинке. + +Этот учебник покажет вам практический способ *запустить OCR на изображении* с использованием движка **aocr**, запросить **structured OCR recognition**, а затем выполнить пост‑обработку результата, сохранив геометрию. К концу вы сможете **извлечь текст с координатами** всего в несколько строк Python и поймёте, почему структурированный режим важен для последующих задач. + +## Что вы узнаете + +- Как инициализировать OCR‑движок для **structured OCR recognition**. +- Как передать изображение и получить необработанные результаты, включающие границы строк. +- Как запустить пост‑процессор, который очищает текст, не теряя геометрию. +- Как перебрать окончательные строки и вывести каждый кусок текста вместе с его ограничительной рамкой. + +Никакой магии, никаких скрытых шагов — просто полностью готовый, исполняемый пример, который вы можете вставить в свой проект. 
+ +--- + +## Предварительные требования + +Прежде чем погрузиться, убедитесь, что у вас установлено следующее: + +```bash +pip install aocr ai # hypothetical packages; replace with real ones if needed +``` + +Вам также понадобится файл изображения (`input_image.png` или `.jpg`), содержащий чёткий, читаемый текст. Подойдёт всё, от отсканированного счета до скриншота, при условии, что OCR‑движок может увидеть символы. + +--- + +## Шаг 1: Инициализировать OCR‑движок для структурированного распознавания + +Первое, что мы делаем, — создаём экземпляр `aocr.Engine()` и указываем, что нам нужен **structured OCR recognition**. Структурированный режим возвращает не только простой текст, но и геометрические данные (ограничительные прямоугольники) для каждой строки, что необходимо, когда нужно сопоставить текст с изображением. + +```python +import aocr +import ai # hypothetical post‑processing module + +# Initialise the OCR engine +ocr_engine = aocr.Engine() + +# Request structured recognition (text + geometry) +ocr_engine.recognize_mode = aocr.RecognitionMode.Structured +``` + +> **Почему это важно:** +> В режиме по умолчанию движок может вернуть лишь одну строку склеенных слов. Структурированный режим предоставляет иерархию страниц → строк → слов, каждая с координатами, что значительно упрощает наложение результатов на оригинальное изображение или передачу их в модель, учитывающую макет. + +--- + +## Шаг 2: Запустить OCR на изображении и получить необработанные результаты + +Теперь передаём изображение в движок. Вызов `recognize` возвращает объект `OcrResult`, содержащий коллекцию строк, каждая со своим ограничительным прямоугольником. 
+ +```python +# Load your image (any format supported by aocr) +input_image_path = "input_image.png" + +# Run OCR – this returns an OcrResult with lines and bounds +raw_result = ocr_engine.recognize(input_image_path) +``` + +На данном этапе `raw_result.lines` содержит объекты с двумя важными атрибутами: + +- `text` – распознанная строка для этой линии. +- `bounds` – кортеж вида `(x, y, width, height)`, описывающий позицию строки. + +--- + +## Шаг 3: Пост‑обработка с сохранением геометрии + +Сырой вывод OCR часто шумный: лишние символы, неправильные пробелы или проблемы с переносами строк. Функция `ai.run_postprocessor` очищает текст, **сохраняя оригинальную геометрию**, так что координаты остаются точными. + +```python +# Apply a post‑processing step that corrects common OCR errors +postprocessed_result = ai.run_postprocessor(raw_result) + +# The structure (lines + bounds) stays the same, only `line.text` changes +``` + +> **Pro tip:** Если у вас есть словари, специфичные для домена (например, коды продуктов), передайте пользовательский словарь в пост‑процессор для повышения точности. + +--- + +## Шаг 4: Извлечь текст с координатами – перебрать и отобразить + +Наконец, мы проходим по очищенным строкам, выводя ограничительную рамку каждой строки рядом с её текстом. Это и есть ядро **извлечения текста с координатами**. + +```python +# Print each recognised line together with its bounding box +for line in postprocessed_result.lines: + print(f"[{line.bounds}] {line.text}") +``` + +### Ожидаемый вывод + +Предположим, что входное изображение содержит две строки: “Invoice #12345” и “Total: $89.99”, вы увидите примерно следующее: + +``` +[(15, 30, 210, 25)] Invoice #12345 +[(15, 70, 190, 25)] Total: $89.99 +``` + +Первый кортеж — это `(x, y, width, height)` строки на оригинальном изображении, позволяющий рисовать прямоугольники, выделять текст или передавать координаты в другую систему. 
+ +--- + +## Визуализация результата (по желанию) + +Если хотите увидеть ограничительные рамки, наложенные на изображение, можно воспользоваться Pillow (PIL) для рисования прямоугольников. Ниже быстрый фрагмент кода; пропустите, если нужны только сырые данные. + +```python +from PIL import Image, ImageDraw + +# Open the original image +img = Image.open(input_image_path) +draw = ImageDraw.Draw(img) + +# Draw a rectangle around each line +for line in postprocessed_result.lines: + x, y, w, h = line.bounds + draw.rectangle([x, y, x + w, y + h], outline="red", width=2) + +# Save or show the annotated image +img.save("annotated_output.png") +img.show() +``` + +![run OCR on image пример, показывающий ограничительные рамки](/images/ocr-bounding-boxes.png "run OCR on image – наложение ограничительных рамок") + +Текст alt выше содержит **основное ключевое слово**, удовлетворяя требованиям SEO для атрибутов alt изображений. + +--- + +## Почему структурированное OCR‑распознавание превосходит простое извлечение текста + +Вы можете задаться вопросом: «Не могу ли я просто запустить OCR и получить текст? Зачем нужна геометрия?» + +- **Пространственный контекст:** Когда нужно сопоставить поля в форме (например, «Дата» рядом со значением даты), координаты показывают, *где* находятся данные. +- **Много‑колоночные макеты:** Простой линейный текст теряет порядок; структурированные данные сохраняют порядок колонок. +- **Точность пост‑обработки:** Знание размеров рамки помогает решить, является ли слово заголовком, сноской или случайным артефактом. + +Короче говоря, **structured OCR recognition** даёт гибкость для построения более умных конвейеров — будь то загрузка данных в базу, создание поисковых PDF или обучение модели машинного обучения, учитывающей макет. 
+ +--- + +## Распространённые граничные случаи и как их решить + +| Ситуация | На что обратить внимание | Предлагаемое решение | +|-----------|--------------------------|----------------------| +| **Повернутые или искривлённые изображения** | Ограничительные рамки могут быть смещены. | Предобработайте с выравниванием (например, `warpAffine` из OpenCV). | +| **Очень маленькие шрифты** | Движок может пропустить символы, что приводит к пустым строкам. | Увеличьте разрешение изображения или используйте `ocr_engine.set_dpi(300)`. | +| **Смешанные языки** | Неправильная языковая модель может вызвать искажённый текст. | Установите `ocr_engine.language = ["en", "de"]` перед распознаванием. | +| **Перекрывающиеся рамки** | Пост‑процессор может непреднамеренно объединить две строки. | Проверьте `line.bounds` после обработки; скорректируйте пороги в `ai.run_postprocessor`. | + +Раннее решение этих сценариев экономит вам головную боль позже, особенно когда вы масштабируете решение до сотен документов в день. + +--- + +## Полный скрипт «от начала до конца» + +Ниже полностью готовая к запуску программа, объединяющая все шаги. Скопируйте‑вставьте, поправьте путь к изображению — и всё готово. + +```python +# -*- coding: utf-8 -*- +""" +Run OCR on image – extract text with coordinates using structured OCR recognition. 
+Author: Your Name +Date: 2026-05-03 +""" + +import aocr +import ai +from PIL import Image, ImageDraw + +def run_structured_ocr(image_path: str, annotate: bool = False): + # 1️⃣ Initialise the OCR engine + ocr_engine = aocr.Engine() + ocr_engine.recognize_mode = aocr.RecognitionMode.Structured + + # 2️⃣ Recognise the image + raw_result = ocr_engine.recognize(image_path) + + # 3️⃣ Post‑process while keeping geometry + processed = ai.run_postprocessor(raw_result) + + # 4️⃣ Print each line with its bounding box + for line in processed.lines: + print(f"[{line.bounds}] {line.text}") + + # Optional visualisation + if annotate: + img = Image.open(image_path) + draw = ImageDraw.Draw(img) + for line in processed.lines: + x, y, w, h = line.bounds + draw.rectangle([x, y, x + w, y + h], outline="red", width=2) + annotated_path = "annotated_" + image_path + img.save(annotated_path) + print(f"Annotated image saved as {annotated_path}") + +if __name__ == "__main__": + INPUT_IMG = "input_image.png" + run_structured_ocr(INPUT_IMG, annotate=True) +``` + +Запуск этого скрипта: + +1. **Запустить OCR на изображении** в структурированном режиме. +2. **Извлечь текст с координатами** для каждой строки. +3. При желании создать аннотированный PNG, показывающий рамки. + +--- + +## Заключение + +Теперь у вас есть надёжное, автономное решение для **запуска OCR на изображении** и **извлечения текста с координатами** с помощью **structured OCR recognition**. Код демонстрирует каждый шаг — от инициализации движка до пост‑обработки и визуальной проверки — так что вы можете адаптировать его под чеки, формы или любые визуальные документы, требующие точного позиционирования текста. + +Что дальше? Попробуйте заменить движок `aocr` на другую библиотеку (Tesseract, EasyOCR) и посмотрите, чем отличаются их структурированные выводы. Поэкспериментируйте с разными стратегиями пост‑обработки, например, проверкой орфографии или пользовательскими regex‑фильтрами, чтобы повысить точность в вашем домене. 
А если вы строите более крупный конвейер, рассмотрите возможность сохранения пар `(text, bounds)` в базе данных для последующего анализа. + +Счастливого кодинга, и пусть ваши OCR‑проекты всегда будут точными! + +{{< /blocks/products/pf/tutorial-page-section >}} +{{< /blocks/products/pf/main-container >}} +{{< /blocks/products/pf/main-wrap-class >}} +{{< blocks/products/products-backtop-button >}} \ No newline at end of file diff --git a/ocr/spanish/python/general/extract-text-from-image-ocr-with-aspose-ai-spell-check/_index.md b/ocr/spanish/python/general/extract-text-from-image-ocr-with-aspose-ai-spell-check/_index.md new file mode 100644 index 000000000..a97794af6 --- /dev/null +++ b/ocr/spanish/python/general/extract-text-from-image-ocr-with-aspose-ai-spell-check/_index.md @@ -0,0 +1,230 @@ +--- +category: general +date: 2026-05-03 +description: extraer texto de una imagen usando Aspose OCR y corrección ortográfica + con IA. aprende cómo hacer OCR a una imagen, cargar la imagen para OCR, reconocer + texto de una factura y liberar los recursos de GPU. +draft: false +keywords: +- extract text from image +- how to ocr image +- load image for ocr +- release gpu resources +- recognize text from invoice +language: es +og_description: extrae texto de una imagen con Aspose OCR y corrección ortográfica + AI. Guía paso a paso que cubre cómo hacer OCR a una imagen, cargar la imagen para + OCR y liberar los recursos de GPU. 
+og_title: extraer texto de una imagen – Guía completa de OCR y corrección ortográfica +tags: +- OCR +- Aspose +- AI +- Python +title: extraer texto de una imagen – OCR con Aspose AI Spell‑Check +url: /es/python/general/extract-text-from-image-ocr-with-aspose-ai-spell-check/ +--- + +{{< blocks/products/pf/main-wrap-class >}} +{{< blocks/products/pf/main-container >}} +{{< blocks/products/pf/tutorial-page-section >}} + +# extraer texto de imagen – Guía completa de OCR y corrección ortográfica + +¿Alguna vez necesitaste **extraer texto de imagen** pero no estabas seguro de qué biblioteca te ofrecería tanto velocidad como precisión? No eres el único. En muchos proyectos del mundo real —piense en procesamiento de facturas, digitalización de recibos o escaneo de contratos— obtener texto limpio y buscable a partir de una foto es el primer obstáculo. + +La buena noticia es que Aspose OCR combinado con un modelo ligero de Aspose AI puede manejar esa tarea en unas pocas líneas de Python. En este tutorial recorreremos **cómo hacer OCR a una imagen**, cargaremos la foto correctamente, ejecutaremos un post‑procesador de corrección ortográfica incorporado y, finalmente, **liberaremos los recursos de GPU** para que tu aplicación sea amigable con la memoria. + +Al final de esta guía podrás **reconocer texto de facturas** en imágenes, corregir automáticamente errores comunes de OCR y mantener tu GPU limpia para el siguiente lote. + +--- + +## Lo que necesitarás + +- Python 3.9 o superior (el código usa anotaciones de tipo pero funciona en versiones 3.x anteriores) +- paquetes `aspose-ocr` y `aspose-ai` (instalar mediante `pip install aspose-ocr aspose-ai`) +- Una GPU con soporte CUDA es opcional; el script recurrirá a la CPU si no se detecta ninguna. +- Una imagen de ejemplo, por ejemplo `sample_invoice.png`, ubicada en una carpeta a la que puedas hacer referencia. 
+ +Sin frameworks de ML pesados, sin descargas masivas de modelos —solo un pequeño modelo cuantizado Q4‑K‑M que cabe cómodamente en la mayoría de las GPUs. + +--- + +## Paso 1: Inicializar el motor OCR – extraer texto de imagen + +Lo primero que haces es crear una instancia de `OcrEngine` y especificar el idioma que esperas. Aquí elegimos inglés y solicitamos salida en texto plano, lo cual es ideal para el procesamiento posterior. + +```python +import aocr # Aspose OCR package +import aspose.ai as ai # Aspose AI package + +# Initialise the OCR engine +ocr_engine = aocr.OcrEngine() +ocr_engine.language = aocr.Language.English # Choose any supported language +ocr_engine.recognize_mode = aocr.RecognitionMode.Plain # Plain text makes post‑processing easier +``` + +**Por qué es importante:** Establecer el idioma reduce el conjunto de caracteres, mejorando la precisión. El modo de texto plano elimina la información de diseño que normalmente no necesitas cuando solo deseas extraer texto de una imagen. + +--- + +## Paso 2: Cargar imagen para OCR – cómo hacer OCR a una imagen + +Ahora alimentamos al motor con una imagen real. El asistente `Image.load` reconoce formatos comunes (PNG, JPEG, TIFF) y abstrae las peculiaridades de la entrada/salida de archivos. + +```python +# Load the input image – this is the "load image for OCR" step +input_image = aocr.Image.load("YOUR_DIRECTORY/sample_invoice.png") +raw_text = ocr_engine.recognize(input_image) # Returns the recognised text as a string +``` + +**Consejo:** Si tus imágenes de origen son grandes, considera redimensionarlas antes de enviarlas al motor; dimensiones más pequeñas pueden reducir el uso de memoria de la GPU sin perjudicar la calidad del reconocimiento. + +--- + +## Paso 3: Configurar el modelo Aspose AI – reconocer texto de facturas + +Aspose AI incluye un pequeño modelo GGUF que puedes descargar automáticamente. El ejemplo usa el repositorio `Qwen2.5‑3B‑Instruct‑GGUF`, cuantizado a `q4_k_m`. 
También indicamos al tiempo de ejecución que asigne 20 capas en la GPU, lo que equilibra velocidad y uso de VRAM. + +```python +# Model configuration – auto‑download a small Q4‑K‑M quantised model +model_config = ai.AsposeAIModelConfig() +model_config.allow_auto_download = "true" +model_config.hugging_face_repo_id = "bartowski/Qwen2.5-3B-Instruct-GGUF" +model_config.hugging_face_quantization = "q4_k_m" +model_config.gpu_layers = 20 # Use 20 GPU layers when a GPU is available +``` + +**Detrás de cámaras:** El modelo cuantizado ocupa aproximadamente 1.5 GB en disco, una fracción de un modelo de precisión completa, pero aún captura suficiente matiz lingüístico para detectar errores típicos de OCR. + +--- + +## Paso 4: Inicializar AsposeAI y adjuntar el post‑procesador de corrección ortográfica + +Aspose AI incluye un post‑procesador de corrección ortográfica listo para usar. Al adjuntarlo, cada resultado de OCR se limpiará automáticamente. + +```python +# Initialise AsposeAI and attach the built‑in spell‑check post‑processor +ocr_ai = ai.AsposeAI(model_config) # Pass the config we just built +ocr_ai.set_post_processor(ocr_ai.postprocessor_spell_check, {}) # Empty dict → default settings +``` + +**¿Por qué usar el post‑procesador?** Los motores OCR a menudo leen “Invoice” como “Invo1ce” o “Total” como “T0tal”. La corrección ortográfica ejecuta un modelo de lenguaje ligero sobre la cadena cruda y corrige esos errores sin que tengas que escribir un diccionario personalizado. + +--- + +## Paso 5: Ejecutar el post‑procesador de corrección ortográfica sobre el resultado OCR + +Con todo conectado, una sola llamada produce el texto corregido. También imprimimos tanto la versión original como la limpiada para que puedas ver la mejora. 
+ +```python +# Run the spell‑check post‑processor on the OCR result +corrected_text = ocr_ai.run_postprocessor(raw_text) + +print("Original :", raw_text) +print("Corrected:", corrected_text) +``` + +Una salida típica para una factura podría verse así: + +``` +Original : Invo1ce #12345 +Date: 2023/07/15 +Total: $1,250.00 +... +Corrected: Invoice #12345 +Date: 2023/07/15 +Total: $1,250.00 +... +``` + +Observa cómo “Invo1ce” se convirtió en la palabra correcta “Invoice”. Ese es el poder de la corrección ortográfica AI incorporada. + +--- + +## Paso 6: Liberar recursos de GPU – liberar recursos de GPU de forma segura + +Si ejecutas esto en un servicio de larga duración (por ejemplo, una API web que procesa decenas de facturas por minuto), debes liberar el contexto de GPU después de cada lote. De lo contrario verás fugas de memoria y eventualmente obtendrás errores de “CUDA out of memory”. + +```python +# Release GPU resources – crucial to avoid memory leaks +ocr_ai.free_resources() +``` + +**Consejo profesional:** Llama a `free_resources()` dentro de un bloque `finally` o un gestor de contexto para que siempre se ejecute, incluso si ocurre una excepción. + +--- + +## Ejemplo completo funcional + +Unir todas las piezas te brinda un script autocontenido que puedes incorporar en cualquier proyecto. 
+ +```python +# extract_text_from_image.py +import aocr +import aspose.ai as ai + +def main(): + # Step 1: Initialise OCR engine + ocr_engine = aocr.OcrEngine() + ocr_engine.language = aocr.Language.English + ocr_engine.recognize_mode = aocr.RecognitionMode.Plain + + # Step 2: Load image for OCR + input_image = aocr.Image.load("YOUR_DIRECTORY/sample_invoice.png") + raw_text = ocr_engine.recognize(input_image) + + # Step 3: Configure Aspose AI model + model_cfg = ai.AsposeAIModelConfig() + model_cfg.allow_auto_download = "true" + model_cfg.hugging_face_repo_id = "bartowski/Qwen2.5-3B-Instruct-GGUF" + model_cfg.hugging_face_quantization = "q4_k_m" + model_cfg.gpu_layers = 20 + + # Step 4: Initialise AI and attach spell‑check + ocr_ai = ai.AsposeAI(model_cfg) + ocr_ai.set_post_processor(ocr_ai.postprocessor_spell_check, {}) + + # Step 5: Run spell‑check + corrected_text = ocr_ai.run_postprocessor(raw_text) + + print("Original :", raw_text) + print("Corrected:", corrected_text) + + # Step 6: Release GPU resources + ocr_ai.free_resources() + +if __name__ == "__main__": + main() +``` + +Guarda el archivo, ajusta la ruta a tu imagen y ejecuta `python extract_text_from_image.py`. Deberías ver el texto de la factura limpiado impreso en la consola. + +--- + +## Preguntas frecuentes (FAQ) + +**Q: ¿Funciona en máquinas solo CPU?** +A: Absolutamente. Si no se detecta GPU, Aspose AI recurre a la ejecución en CPU, aunque será más lento. Puedes forzar la CPU estableciendo `model_cfg.gpu_layers = 0`. + +**Q: ¿Qué pasa si mis facturas están en un idioma distinto al inglés?** +A: Cambia `ocr_engine.language` al valor enum apropiado (por ejemplo, `aocr.Language.Spanish`). El modelo de corrección ortográfica es multilingüe, pero puedes obtener mejores resultados con un modelo específico para el idioma. + +**Q: ¿Puedo procesar múltiples imágenes en un bucle?** +A: Sí. Simplemente mueve los pasos de carga, reconocimiento y post‑procesamiento dentro de un bucle `for`. 
Recuerda llamar a `ocr_ai.free_resources()` después del bucle o después de cada lote si reutilizas la misma instancia AI. + +**Q: ¿Qué tamaño tiene la descarga del modelo?** +A: Aproximadamente 1.5 GB para la versión cuantizada `q4_k_m`. Se almacena en caché después de la primera ejecución, por lo que ejecuciones posteriores son instantáneas. + +--- + +## Conclusión + +En este tutorial demostramos cómo **extraer texto de imagen** usando Aspose OCR, configurar un pequeño modelo AI, aplicar un post‑procesador de corrección ortográfica y liberar de forma segura los **recursos de GPU**. El flujo de trabajo cubre todo, desde cargar la foto hasta limpiar después de ti, brindándote una canalización confiable para escenarios de **reconocer texto de facturas**. + +¿Próximos pasos? Intenta reemplazar la corrección ortográfica por un modelo personalizado de extracción de entidades + +{{< /blocks/products/pf/tutorial-page-section >}} +{{< /blocks/products/pf/main-container >}} +{{< /blocks/products/pf/main-wrap-class >}} +{{< blocks/products/products-backtop-button >}} \ No newline at end of file diff --git a/ocr/spanish/python/general/how-to-batch-ocr-with-aspose-ocr-full-python-guide/_index.md b/ocr/spanish/python/general/how-to-batch-ocr-with-aspose-ocr-full-python-guide/_index.md new file mode 100644 index 000000000..f482fef7a --- /dev/null +++ b/ocr/spanish/python/general/how-to-batch-ocr-with-aspose-ocr-full-python-guide/_index.md @@ -0,0 +1,201 @@ +--- +category: general +date: 2026-05-03 +description: Cómo procesar OCR por lotes de imágenes usando Aspose OCR y corrección + ortográfica con IA. Aprende a extraer texto de imágenes, aplicar corrección ortográfica, + recursos de IA gratuitos y corregir errores de OCR. +draft: false +keywords: +- how to batch ocr +- extract text from images +- free ai resources +- apply spell check +- correct ocr errors +language: es +og_description: Cómo procesar OCR por lotes de imágenes usando Aspose OCR y corrección + ortográfica con IA. 
Sigue una guía paso a paso para extraer texto de imágenes, aplicar + la corrección ortográfica, liberar recursos de IA y corregir errores de OCR. +og_title: Cómo realizar OCR por lotes con Aspose OCR – Tutorial completo de Python +tags: +- OCR +- Python +- AI +- Aspose +title: Cómo realizar OCR por lotes con Aspose OCR – Guía completa de Python +url: /es/python/general/how-to-batch-ocr-with-aspose-ocr-full-python-guide/ +--- + +{{< blocks/products/pf/main-wrap-class >}} +{{< blocks/products/pf/main-container >}} +{{< blocks/products/pf/tutorial-page-section >}} + +# Cómo procesar OCR por lotes con Aspose OCR – Guía completa en Python + +¿Alguna vez te has preguntado **cómo procesar OCR por lotes** una carpeta completa de PDFs escaneados o fotos sin escribir un script separado para cada archivo? No estás solo. En muchos flujos de trabajo reales necesitarás **extraer texto de imágenes**, corregir errores ortográficos y, finalmente, liberar los recursos de IA que hayas asignado. Este tutorial te muestra exactamente cómo hacerlo con Aspose OCR, un post‑procesador de IA ligero, y unas pocas líneas de Python. + +Recorreremos la inicialización del motor OCR, la conexión de un corrector ortográfico de IA, el bucle sobre un directorio de imágenes y la limpieza del modelo al final. Al terminar tendrás un script listo para ejecutar que **corrige errores de OCR** automáticamente y libera **recursos de IA** para que tu GPU se mantenga feliz. + +## Lo que necesitarás + +- Python 3.9+ (el código usa type‑hints pero funciona en versiones 3.x anteriores) +- `asposeocr` package (`pip install asposeocr`) – este proporciona el motor OCR. +- Acceso al modelo de Hugging Face `bartowski/Qwen2.5-3B-Instruct-GGUF` (se descarga automáticamente). +- Una GPU con al menos unos GB de VRAM (el script establece `gpu_layers = 30`, puedes reducirlo si es necesario). + +Sin servicios externos, sin APIs de pago – todo se ejecuta localmente. 
+ +--- + +## Paso 1: Configurar el motor OCR – **Cómo procesar OCR por lotes** de manera eficiente + +Antes de poder procesar mil imágenes necesitamos un motor OCR sólido. Aspose OCR nos permite elegir el idioma y el modo de reconocimiento en una sola llamada. + +```python +# Step 1: Initialize the OCR engine for English plain‑text output +def init_ocr() -> aocr.OcrEngine: + ocr_engine = aocr.OcrEngine() + ocr_engine.language = aocr.Language.English # English language pack + ocr_engine.recognize_mode = aocr.RecognitionMode.Plain # Returns raw string, no layout + return ocr_engine +``` + +**Por qué es importante:** Configurar `recognize_mode` a `Plain` mantiene la salida ligera, lo cual es ideal cuando planeas ejecutar una corrección ortográfica después. Si necesitaras información de diseño, cambiarías a `Layout`, pero eso añade sobrecarga que probablemente no quieras en un trabajo por lotes. + +> **Consejo profesional:** Si estás trabajando con escaneos multilingües, puedes pasar una lista como `ocr_engine.language = [aocr.Language.English, aocr.Language.Spanish]`. + +## Paso 2: Inicializar el post‑procesador de IA – **Aplicar corrección ortográfica** a la salida OCR + +Aspose AI incluye un post‑procesador incorporado que puede ejecutar cualquier modelo que desees. Aquí obtenemos un modelo Qwen 2.5 cuantizado de Hugging Face y conectamos la rutina de corrección ortográfica. 
+ +```python +# Step 2: Configure and start the AI post‑processor +def init_ai() -> aocr.ai.AsposeAI: + model_cfg = AsposeAIModelConfig() + model_cfg.allow_auto_download = "true" + model_cfg.hugging_face_repo_id = "bartowski/Qwen2.5-3B-Instruct-GGUF" + model_cfg.hugging_face_quantization = "q4_k_m" + model_cfg.gpu_layers = 30 # Adjust based on your GPU memory + ai_processor = AsposeAI() + ai_processor.initialize(model_cfg) + + # Attach the built‑in spell‑check post‑processor + ai_processor.set_post_processor(ai_processor.postprocessor_spell_check, {}) + return ai_processor +``` + +**Por qué es importante:** El modelo está cuantizado (`q4_k_m`), lo que reduce drásticamente el uso de memoria mientras sigue ofreciendo una comprensión lingüística decente. Al llamar a `set_post_processor` indicamos a Aspose AI que ejecute automáticamente el paso de **aplicar corrección ortográfica** en cualquier cadena que le pasemos. + +> **Cuidado:** Si tu GPU no puede manejar 30 capas, reduce el número a 15 o incluso 5 – el script seguirá funcionando, solo un poco más lento. + +## Paso 3: Ejecutar OCR y **corregir errores de OCR** en una sola imagen + +Ahora que tanto el motor OCR como el corrector ortográfico de IA están listos, los combinamos. Esta función carga una imagen, extrae el texto bruto y luego ejecuta el post‑procesador de IA para limpiarlo. + +```python +# Step 3: OCR an image and run the spell‑check post‑processor +def ocr_and_correct(image_path: str, + ocr_engine: aocr.OcrEngine, + ai_processor: aocr.ai.AsposeAI) -> str: + image = aocr.Image.load(image_path) # Load any supported format + raw_text = ocr_engine.recognize(image) # Plain string from OCR + corrected_text = ai_processor.run_postprocessor(raw_text) + return corrected_text +``` + +**Por qué es importante:** Alimentar directamente la cadena OCR cruda al modelo de IA nos brinda una pasada de **corregir errores de OCR** sin escribir expresiones regulares ni diccionarios personalizados. 
El modelo entiende el contexto, por lo que puede corregir “recieve” → “receive” y errores aún más sutiles. + +## Paso 4: **Extraer texto de imágenes** en lote – El bucle real por lotes + +Aquí es donde brilla la magia de **cómo procesar OCR por lotes**. Iteramos sobre un directorio, omitimos archivos no compatibles y escribimos cada salida corregida en un archivo `.txt`. + +```python +# Step 4: Process an entire folder of images +if __name__ == "__main__": + # Initialize once – reuse for every file + ocr_engine = init_ocr() + ai_processor = init_ai() + + input_dir = "YOUR_DIRECTORY/input_images" + output_dir = "YOUR_DIRECTORY/output_text" + os.makedirs(output_dir, exist_ok=True) + + for file_name in os.listdir(input_dir): + # Only handle common image extensions + if not file_name.lower().endswith(('.png', '.jpg', '.jpeg', '.tif', '.tiff')): + continue + + image_path = os.path.join(input_dir, file_name) + corrected = ocr_and_correct(image_path, ocr_engine, ai_processor) + + txt_path = os.path.join(output_dir, + os.path.splitext(file_name)[0] + ".txt") + with open(txt_path, "w", encoding="utf-8") as txt_file: + txt_file.write(corrected) + + print(f"Processed {file_name}") + + # Step 5: Release **free AI resources** after the batch finishes + ai_processor.free_resources() +``` + +### Salida esperada + +Para una imagen que contiene la frase *“The quick brown fox jumps over the lazzy dog.”* verás un archivo de texto con: + +``` +The quick brown fox jumps over the lazy dog. +``` + +Observa que la doble “z” se corrigió automáticamente – eso es la corrección ortográfica de IA en acción. + +**Por qué es importante:** Al crear los objetos OCR y IA **una sola vez** y reutilizarlos, evitamos la sobrecarga de cargar el modelo para cada archivo. Esta es la forma más eficiente de **procesar OCR por lotes** a gran escala. 
+ +## Paso 5: Limpieza – **Liberar recursos de IA** correctamente + +Cuando termines, llamar a `free_resources()` libera la memoria de la GPU, los contextos CUDA y cualquier archivo temporal que haya creado el modelo. + +```python +# Step 5: Explicitly free GPU and model memory +ai_processor.free_resources() +``` + +Omitir este paso puede dejar asignaciones de GPU colgantes, lo que podría bloquear procesos posteriores de Python o consumir VRAM. Piensa en ello como la parte de “apagar las luces” de un trabajo por lotes. + +## Problemas comunes y consejos extra + +| Problema | Qué buscar | Solución | +|----------|------------|----------| +| **Errores de falta de memoria** | La GPU se queda sin memoria después de unas decenas de imágenes | Reduce `gpu_layers` o cambia a CPU (`model_cfg.gpu_layers = 0`). | +| **Paquete de idioma faltante** | OCR devuelve cadenas vacías | Asegúrate de que la versión de `asposeocr` incluya los datos de idioma inglés; reinstala si es necesario. | +| **Archivos no imagen** | El script se bloquea con un `.pdf` inesperado | La condición `if not file_name.lower().endswith(...)` ya los omite. | +| **Corrección ortográfica no aplicada** | La salida es idéntica al OCR bruto | Verifica que `ai_processor.set_post_processor` se haya llamado antes del bucle. | +| **Velocidad de lote lenta** | Toma >5 segundos por imagen | Habilita `model_cfg.allow_auto_download = "false"` después de la primera ejecución, para que el modelo no se vuelva a descargar cada vez. | + +**Consejo profesional:** Si necesitas **extraer texto de imágenes** en un idioma distinto al inglés, simplemente cambia `ocr_engine.language` al enum correspondiente (p.ej., `aocr.Language.French`). El mismo post‑procesador de IA seguirá aplicando la corrección ortográfica, pero podrías querer un modelo específico del idioma para obtener los mejores resultados. + +## Resumen y próximos pasos + +Hemos cubierto todo el flujo para **procesar OCR por lotes**: + +1. 
**Inicializar** un motor OCR de texto plano para inglés. +2. **Configurar** un modelo de corrección ortográfica de IA y enlazarlo como post‑procesador. +3. **Ejecutar** OCR en cada imagen y dejar que la IA **corrija errores de OCR** automáticamente. +4. **Iterar** sobre un directorio para **extraer texto de imágenes** en lote. +5. **Liberar recursos de IA** una vez que el trabajo termina. + +A partir de aquí podrías: + +- Pasar el texto corregido a una canalización NLP posterior (análisis de sentimiento, extracción de entidades, etc.). +- Cambiar el post‑procesador de corrección ortográfica por un resumidor personalizado llamando a `ai_processor.set_post_processor(your_custom_func, {})`. +- Paralelizar el bucle de la carpeta con `concurrent.futures.ThreadPoolExecutor` si tu GPU puede manejar múltiples flujos. + +## Reflexiones finales + +Procesar OCR por lotes no tiene que ser una tarea tediosa. Al combinar Aspose OCR con un modelo de IA ligero, obtienes una **solución integral** que **extrae texto de imágenes**, **aplica corrección ortográfica**, **corrige errores de OCR** y **libera recursos de IA** de forma limpia. Prueba el script en una carpeta de prueba, ajusta el número de capas de GPU para que coincida con tu hardware, y tendrás una canalización lista para producción en minutos. + +¿Tienes preguntas sobre ajustar el modelo, manejar PDFs o integrar esto en un servicio web? Deja un comentario abajo o envíame un mensaje en GitHub. ¡Feliz codificación, y que tu OCR sea siempre preciso! 
+ +{{< /blocks/products/pf/tutorial-page-section >}} +{{< /blocks/products/pf/main-container >}} +{{< /blocks/products/pf/main-wrap-class >}} +{{< blocks/products/products-backtop-button >}} \ No newline at end of file diff --git a/ocr/spanish/python/general/python-ocr-tutorial-batch-ocr-processing-made-easy/_index.md b/ocr/spanish/python/general/python-ocr-tutorial-batch-ocr-processing-made-easy/_index.md new file mode 100644 index 000000000..05a1b8f6a --- /dev/null +++ b/ocr/spanish/python/general/python-ocr-tutorial-batch-ocr-processing-made-easy/_index.md @@ -0,0 +1,300 @@ +--- +category: general +date: 2026-05-03 +description: Tutorial de OCR en Python que muestra cómo cargar archivos de imagen + PNG, reconocer texto de la imagen y recursos de IA gratuitos para procesamiento + por lotes de OCR. +draft: false +keywords: +- python ocr tutorial +- batch ocr processing +- free ai resources +- load png image +- recognize text from image +language: es +og_description: El tutorial de OCR en Python te guía a través de la carga de imágenes + PNG, el reconocimiento de texto en la imagen y el manejo de recursos de IA gratuitos + para el procesamiento por lotes de OCR. +og_title: Tutorial de OCR en Python – OCR por lotes rápido con recursos de IA gratuitos +tags: +- OCR +- Python +- AI +title: Tutorial de OCR en Python – Procesamiento por lotes de OCR fácil +url: /es/python/general/python-ocr-tutorial-batch-ocr-processing-made-easy/ +--- + +{{< blocks/products/pf/main-wrap-class >}} +{{< blocks/products/pf/main-container >}} +{{< blocks/products/pf/tutorial-page-section >}} + +# Tutorial de OCR en Python – Procesamiento por lotes de OCR fácil + +¿Alguna vez necesitaste un **python ocr tutorial** que realmente te permita ejecutar OCR en docenas de archivos PNG sin volverte loco? No estás solo. En muchos proyectos del mundo real tienes que **load png image** archivos, alimentarlos a un motor y luego limpiar los recursos de IA cuando terminas. 
+ +En esta guía recorreremos un ejemplo completo, listo‑para‑ejecutar, que muestra exactamente cómo **recognize text from image** archivos, procesarlos por lotes y liberar la memoria de IA subyacente. Al final tendrás un script autónomo que puedes incorporar a cualquier proyecto—sin adornos extra, solo lo esencial. + +## Lo que necesitarás + +- Python 3.10 o superior (la sintaxis usada aquí depende de f‑strings y anotaciones de tipo) +- Una biblioteca OCR que exponga un método `engine.recognize` – para propósitos de demostración asumiremos un paquete ficticio `aocr`, pero puedes sustituirlo por Tesseract, EasyOCR, etc. +- El módulo auxiliar `ai` mostrado en el fragmento de código (maneja la inicialización del modelo y la limpieza de recursos) +- Una carpeta llena de archivos PNG que deseas procesar + +Si no tienes `aocr` o `ai` instalados, puedes imitarlos con stubs – consulta la sección “Optional Stubs” al final. + +## Paso 1: Inicializar el motor de IA (Liberar recursos de IA) + +Antes de alimentar cualquier imagen al pipeline de OCR, el modelo subyacente debe estar listo. Inicializar solo una vez ahorra memoria y acelera los trabajos por lotes. + +```python +# step_1_initialize.py +import ai # hypothetical helper that wraps the AI model +import aocr # OCR library + +def init_engine(config_path: str = "config.yaml"): + """ + Initialize the AI engine if it hasn't been set up yet. + This uses free AI resources – the engine will be released later. + """ + if not ai.is_initialized(): + ai.initialize(config_path) # auto‑initialize with the provided configuration + else: + print("Engine already initialized.") +``` + +**Por qué esto importa:** +Llamar a `ai.initialize` repetidamente para cada imagen asignaría memoria GPU una y otra vez, eventualmente haciendo que el script se bloquee. Al comprobar `ai.is_initialized()` garantizamos una única asignación – ese es el principio de “free AI resources”. 
+ +## Paso 2: Cargar archivos de imagen PNG para procesamiento OCR por lotes + +Ahora recopilamos todos los archivos PNG que queremos procesar con OCR. Usar `pathlib` mantiene el código independiente del SO. + +```python +# step_2_load_images.py +from pathlib import Path +from typing import List + +def collect_png_paths(directory: str) -> List[Path]: + """ + Scan `directory` and return a list of Path objects pointing to PNG files. + """ + base_path = Path(directory) + if not base_path.is_dir(): + raise NotADirectoryError(f"'{directory}' is not a valid folder.") + + png_files = sorted(base_path.glob("*.png")) + if not png_files: + raise FileNotFoundError("No PNG images found in the specified directory.") + + print(f"Found {len(png_files)} PNG image(s) to process.") + return png_files +``` + +**Caso límite:** +Si la carpeta contiene archivos que no son PNG (p. ej., JPEG) serán ignorados, evitando que `engine.recognize` falle por un formato no soportado. + +## Paso 3: Ejecutar OCR en cada imagen y aplicar post‑procesamiento + +Con el motor listo y la lista de archivos preparada, podemos iterar sobre las imágenes, extraer texto bruto y pasarlo a un post‑procesador que limpia artefactos comunes de OCR (como saltos de línea erróneos). + +```python +# step_3_ocr_batch.py +import aocr +import ai +from pathlib import Path +from typing import List + +def ocr_batch(image_paths: List[Path]) -> List[str]: + """ + Perform OCR on each PNG image and return a list of cleaned strings. 
+ """ + results = [] + for image_path in image_paths: + # Load the image – aocr.Image.load abstracts away Pillow/OpenCV details + img = aocr.Image.load(str(image_path)) + + # Recognize raw text + raw_text = engine.recognize(img) + + # Refine the raw OCR output using the AI post‑processor + cleaned_text = ai.run_postprocessor(raw_text) + results.append(cleaned_text) + + print(f"Processed {image_path.name}: {len(cleaned_text)} characters extracted.") + + return results +``` + +**Por qué separamos la carga del reconocimiento:** +`aocr.Image.load` puede realizar decodificación perezosa, lo que es más rápido para lotes grandes. Mantener el paso de carga explícito también facilita cambiar a una biblioteca de imágenes diferente si más adelante necesitas manejar archivos JPEG o TIFF. + +## Paso 4: Limpieza – Liberar recursos de IA después del lote + +Una vez que el lote ha finalizado, debemos liberar el modelo para evitar fugas de memoria, especialmente en máquinas con GPU. + +```python +# step_4_cleanup.py +import ai + +def release_resources(): + """ + Free any allocated AI resources. Safe to call multiple times. + """ + if ai.is_initialized(): + ai.free_resources() + print("AI resources have been released.") + else: + print("No AI resources were allocated.") +``` + +## Juntándolo todo – El script completo + +A continuación tienes un único archivo que une los cuatro pasos en un flujo de trabajo cohesivo. Guárdalo como `batch_ocr.py` y ejecútalo desde la línea de comandos. + +```python +# batch_ocr.py +""" +Python OCR tutorial – end‑to‑end batch OCR processing. +Loads PNG images, runs OCR, post‑processes results, and frees AI resources. 
+""" + +import sys +from pathlib import Path +import ai +import aocr + +# ---------------------------------------------------------------------- +# Helper functions (copied from the steps above) +# ---------------------------------------------------------------------- +def init_engine(cfg: str = "config.yaml"): + if not ai.is_initialized(): + ai.initialize(cfg) + else: + print("Engine already initialized.") + +def collect_png_paths(directory: str): + base_path = Path(directory) + if not base_path.is_dir(): + raise NotADirectoryError(f"'{directory}' is not a valid folder.") + png_files = sorted(base_path.glob("*.png")) + if not png_files: + raise FileNotFoundError("No PNG images found in the specified directory.") + print(f"Found {len(png_files)} PNG image(s) to process.") + return png_files + +def ocr_batch(image_paths): + results = [] + for image_path in image_paths: + img = aocr.Image.load(str(image_path)) + raw_text = engine.recognize(img) + cleaned_text = ai.run_postprocessor(raw_text) + results.append(cleaned_text) + print(f"Processed {image_path.name}: {len(cleaned_text)} characters.") + return results + +def release_resources(): + if ai.is_initialized(): + ai.free_resources() + print("AI resources released.") + else: + print("No resources to release.") + +# ---------------------------------------------------------------------- +# Main execution block +# ---------------------------------------------------------------------- +if __name__ == "__main__": + if len(sys.argv) != 2: + print("Usage: python batch_ocr.py ") + sys.exit(1) + + image_dir = sys.argv[1] + + try: + init_engine() + png_paths = collect_png_paths(image_dir) + texts = ocr_batch(png_paths) + + # Optional: write results to a single text file + output_file = Path("ocr_results.txt") + with output_file.open("w", encoding="utf-8") as f: + for path, txt in zip(png_paths, texts): + f.write(f"--- {path.name} ---\n") + f.write(txt + "\n\n") + print(f"All results saved to {output_file.resolve()}") + 
finally: + release_resources() +``` + +### Salida esperada + +Ejecutar el script contra una carpeta que contiene tres PNGs podría imprimir: + +``` +Engine already initialized. +Found 3 PNG image(s) to process. +Processed invoice1.png: 452 characters. +Processed receipt2.png: 317 characters. +Processed flyer3.png: 689 characters. +All results saved to /home/user/ocr_results.txt +AI resources released. +``` + +El archivo `ocr_results.txt` contendrá un delimitador claro para cada imagen seguido del texto OCR limpiado. + +## Stubs opcionales para aocr & ai (Si no tienes paquetes reales) + +Si solo deseas probar el flujo sin cargar bibliotecas OCR pesadas, puedes crear módulos simulados mínimos: + +```python +# aocr/__init__.py +class Image: + @staticmethod + def load(path): + return f"ImageObject({path})" + +def dummy_recognize(image): + return "Raw OCR output for " + str(image) + +engine = type("Engine", (), {"recognize": dummy_recognize})() +``` + +```python +# ai/__init__.py +_state = {"initialized": False} + +def is_initialized(): + return _state["initialized"] + +def initialize(cfg): + print(f"Initializing AI engine with {cfg}") + _state["initialized"] = True + +def run_postprocessor(text): + # Very naive cleanup: strip extra spaces + return " ".join(text.split()) + +def free_resources(): + print("Freeing AI resources") + _state["initialized"] = False +``` + +Coloca estas carpetas junto a `batch_ocr.py` y el script se ejecutará, imprimiendo resultados simulados. + +## Consejos profesionales y errores comunes + +- **Picos de memoria:** Si estás procesando miles de PNG de alta resolución, considera redimensionarlos antes del OCR. `aocr.Image.load` a menudo acepta un argumento `max_size`. +- **Manejo de Unicode:** Siempre abre el archivo de salida con `encoding="utf-8"`; los motores OCR pueden emitir caracteres no ASCII. +- **Paralelismo:** Para OCR limitado por CPU puedes envolver `ocr_batch` en un `concurrent.futures.ThreadPoolExecutor`. 
Solo recuerda mantener una única instancia de `ai` – generar muchos hilos que cada uno llame a `ai.initialize` anula el propósito de “free AI resources”. +- **Resiliencia ante errores:** Envuelve el bucle por‑imagen en un bloque `try/except` para que un solo PNG corrupto no aborte todo el lote.
+draft: false +keywords: +- run OCR on image +- extract text with coordinates +- structured OCR recognition +- OCR post‑processing +- bounding box extraction +- image text detection +language: es +og_description: Ejecuta OCR en una imagen y obtén el texto con coordenadas usando + reconocimiento OCR estructurado. Ejemplo completo en Python con explicaciones. +og_title: Ejecutar OCR en una imagen – Tutorial de extracción de texto estructurado +tags: +- OCR +- Python +- Computer Vision +title: Ejecutar OCR en una imagen – Guía completa para la extracción de texto estructurado +url: /es/python/general/run-ocr-on-image-complete-guide-to-structured-text-extractio/ +--- + +{{< blocks/products/pf/main-wrap-class >}} +{{< blocks/products/pf/main-container >}} +{{< blocks/products/pf/tutorial-page-section >}} + +# Ejecutar OCR en imagen – Guía completa para la extracción de texto estructurado + +¿Alguna vez necesitaste **ejecutar OCR en imagen** pero no estabas seguro de cómo mantener las posiciones exactas de cada palabra? No estás solo. En muchos proyectos—escaneo de recibos, digitalización de formularios o pruebas de UI—necesitas no solo el texto bruto sino también los cuadros delimitadores que indican dónde se encuentra cada línea en la foto. + +Este tutorial te muestra una forma práctica de *ejecutar OCR en imagen* usando el motor **aocr**, solicitar **reconocimiento OCR estructurado**, y luego post‑procesar el resultado preservando la geometría. Al final podrás **extraer texto con coordenadas** en solo unas pocas líneas de Python, y entenderás por qué el modo estructurado es importante para tareas posteriores. + +## Lo que aprenderás + +- Cómo inicializar el motor OCR para **reconocimiento OCR estructurado**. +- Cómo alimentar una imagen y recibir resultados crudos que incluyen los límites de línea. +- Cómo ejecutar un post‑procesador que limpia el texto sin perder la geometría. 
+- Cómo iterar sobre las líneas finales e imprimir cada fragmento de texto junto con su cuadro delimitador. + +Sin trucos, sin pasos ocultos—solo un ejemplo completo y ejecutable que puedes incorporar a tu propio proyecto. + +--- + +## Requisitos previos + +Antes de sumergirnos, asegúrate de tener lo siguiente instalado: + +```bash +pip install aocr ai # hypothetical packages; replace with real ones if needed +``` + +También necesitarás un archivo de imagen (`input_image.png` o `.jpg`) que contenga texto claro y legible. Desde una factura escaneada hasta una captura de pantalla sirve, siempre que el motor OCR pueda ver los caracteres. + +--- + +## Paso 1: Inicializar el motor OCR para reconocimiento estructurado + +Lo primero que hacemos es crear una instancia de `aocr.Engine()` y decirle que queremos **reconocimiento OCR estructurado**. El modo estructurado devuelve no solo el texto plano sino también datos geométricos (rectángulos delimitadores) para cada línea, lo cual es esencial cuando necesitas mapear el texto de vuelta a la imagen. + +```python +import aocr +import ai # hypothetical post‑processing module + +# Initialise the OCR engine +ocr_engine = aocr.Engine() + +# Request structured recognition (text + geometry) +ocr_engine.recognize_mode = aocr.RecognitionMode.Structured +``` + +> **Por qué importa:** +> En el modo predeterminado el motor podría entregarte solo una cadena de palabras concatenadas. El modo estructurado te brinda una jerarquía de páginas → líneas → palabras, cada una con coordenadas, lo que facilita mucho superponer los resultados sobre la imagen original o alimentarlos a un modelo consciente del diseño. + +--- + +## Paso 2: Ejecutar OCR en la imagen y obtener resultados crudos + +Ahora alimentamos la imagen al motor. La llamada `recognize` devuelve un objeto `OcrResult` que contiene una colección de líneas, cada una con su propio rectángulo delimitador. 
+ +```python +# Load your image (any format supported by aocr) +input_image_path = "input_image.png" + +# Run OCR – this returns an OcrResult with lines and bounds +raw_result = ocr_engine.recognize(input_image_path) +``` + +En este punto `raw_result.lines` contiene objetos con dos atributos importantes: + +- `text` – la cadena reconocida para esa línea. +- `bounds` – una tupla como `(x, y, width, height)` que describe la posición de la línea. + +--- + +## Paso 3: Post‑procesar preservando la geometría + +La salida OCR cruda suele ser ruidosa: caracteres errantes, espacios mal ubicados o problemas de saltos de línea. La función `ai.run_postprocessor` limpia el texto pero **mantiene la geometría original** intacta, de modo que sigues teniendo coordenadas precisas. + +```python +# Apply a post‑processing step that corrects common OCR errors +postprocessed_result = ai.run_postprocessor(raw_result) + +# The structure (lines + bounds) stays the same, only `line.text` changes +``` + +> **Consejo profesional:** Si dispones de vocabularios específicos del dominio (p. ej., códigos de producto), proporciona un diccionario personalizado al post‑procesador para mejorar la precisión. + +--- + +## Paso 4: Extraer texto con coordenadas – iterar y mostrar + +Finalmente, recorremos las líneas limpias, imprimiendo el cuadro delimitador de cada línea junto con su texto. Este es el núcleo de **extraer texto con coordenadas**. + +```python +# Print each recognised line together with its bounding box +for line in postprocessed_result.lines: + print(f"[{line.bounds}] {line.text}") +``` + +### Salida esperada + +Suponiendo que la imagen de entrada contiene dos líneas: “Invoice #12345” y “Total: $89.99”, verás algo como: + +``` +[(15, 30, 210, 25)] Invoice #12345 +[(15, 70, 190, 25)] Total: $89.99 +``` + +La primera tupla es el `(x, y, width, height)` de la línea en la imagen original, lo que te permite dibujar rectángulos, resaltar texto o pasar las coordenadas a otro sistema. 
+ +--- + +## Visualizar el resultado (opcional) + +Si deseas ver los cuadros delimitadores superpuestos sobre la imagen, puedes usar Pillow (PIL) para dibujar rectángulos. A continuación tienes un fragmento rápido; si solo necesitas los datos crudos, puedes omitirlo. + +```python +from PIL import Image, ImageDraw + +# Open the original image +img = Image.open(input_image_path) +draw = ImageDraw.Draw(img) + +# Draw a rectangle around each line +for line in postprocessed_result.lines: + x, y, w, h = line.bounds + draw.rectangle([x, y, x + w, y + h], outline="red", width=2) + +# Save or show the annotated image +img.save("annotated_output.png") +img.show() +``` + +![run OCR on image example showing bounding boxes](/images/ocr-bounding-boxes.png "run OCR on image – bounding box overlay") + +El texto alternativo anterior contiene la **palabra clave principal**, cumpliendo con el requisito SEO para atributos alt de imágenes. + +--- + +## Por qué el reconocimiento OCR estructurado supera la extracción de texto simple + +Quizá te preguntes, “¿No puedo simplemente ejecutar OCR y obtener el texto? ¿Por qué preocuparme por la geometría?” + +- **Contexto espacial:** Cuando necesitas mapear campos en un formulario (p. ej., “Date” junto al valor de la fecha), las coordenadas indican *dónde* está el dato. +- **Diseños multicolumna:** El texto lineal simple pierde el orden; los datos estructurados preservan el orden de columnas. +- **Precisión del post‑procesamiento:** Conocer el tamaño de la caja te ayuda a decidir si una palabra es un encabezado, una nota al pie o un artefacto errante. + +En resumen, **reconocimiento OCR estructurado** te brinda la flexibilidad para construir pipelines más inteligentes—ya sea que estés alimentando datos a una base de datos, creando PDFs buscables o entrenando un modelo de aprendizaje automático que respete el diseño. 
+ +--- + +## Casos límite comunes y cómo manejarlos + +| Situación | Qué observar | Solución sugerida | +|-----------|--------------|-------------------| +| **Imágenes rotadas o sesgadas** | Los cuadros delimitadores pueden estar desalineados. | Pre‑procesar con corrección de sesgo (p. ej., `warpAffine` de OpenCV). | +| **Fuentes muy pequeñas** | El motor puede omitir caracteres, generando líneas vacías. | Aumentar la resolución de la imagen o usar `ocr_engine.set_dpi(300)`. | +| **Idiomas mixtos** | Un modelo de idioma incorrecto puede producir texto garbled. | Configurar `ocr_engine.language = ["en", "de"]` antes del reconocimiento. | +| **Cajas superpuestas** | El post‑procesador podría fusionar dos líneas inadvertidamente. | Verificar `line.bounds` después del procesamiento; ajustar umbrales en `ai.run_postprocessor`. | + +Abordar estos escenarios desde el principio te ahorra dolores de cabeza más adelante, especialmente cuando escalas la solución a cientos de documentos al día. + +--- + +## Script completo de extremo a extremo + +A continuación tienes el programa completo, listo para ejecutar. Copia‑pega, ajusta la ruta de la imagen y listo. + +```python +# -*- coding: utf-8 -*- +""" +Run OCR on image – extract text with coordinates using structured OCR recognition. 
+Author: Your Name +Date: 2026-05-03 +""" + +import aocr +import ai +from PIL import Image, ImageDraw + +def run_structured_ocr(image_path: str, annotate: bool = False): + # 1️⃣ Initialise the OCR engine + ocr_engine = aocr.Engine() + ocr_engine.recognize_mode = aocr.RecognitionMode.Structured + + # 2️⃣ Recognise the image + raw_result = ocr_engine.recognize(image_path) + + # 3️⃣ Post‑process while keeping geometry + processed = ai.run_postprocessor(raw_result) + + # 4️⃣ Print each line with its bounding box + for line in processed.lines: + print(f"[{line.bounds}] {line.text}") + + # Optional visualisation + if annotate: + img = Image.open(image_path) + draw = ImageDraw.Draw(img) + for line in processed.lines: + x, y, w, h = line.bounds + draw.rectangle([x, y, x + w, y + h], outline="red", width=2) + annotated_path = "annotated_" + image_path + img.save(annotated_path) + print(f"Annotated image saved as {annotated_path}") + +if __name__ == "__main__": + INPUT_IMG = "input_image.png" + run_structured_ocr(INPUT_IMG, annotate=True) +``` + +Ejecutar este script hará lo siguiente: + +1. **Ejecutar OCR en imagen** con modo estructurado. +2. **Extraer texto con coordenadas** para cada línea. +3. Opcionalmente producir un PNG anotado que muestra los cuadros. + +--- + +## Conclusión + +Ahora dispones de una solución sólida y autónoma para **ejecutar OCR en imagen** y **extraer texto con coordenadas** usando **reconocimiento OCR estructurado**. El código muestra cada paso—desde la inicialización del motor hasta el post‑procesamiento y la verificación visual—para que puedas adaptarlo a recibos, formularios o cualquier documento visual que requiera localización precisa del texto. + +¿Qué sigue? Prueba cambiar el motor `aocr` por otra biblioteca (Tesseract, EasyOCR) y observa cómo difieren sus salidas estructuradas. Experimenta con distintas estrategias de post‑procesamiento, como corrección ortográfica o filtros regex personalizados, para mejorar la precisión en tu dominio. 
Y si construyes una pipeline más grande, considera almacenar los pares `(text, bounds)` en una base de datos para análisis posteriores. + +¡Feliz codificación, y que tus proyectos OCR sean siempre precisos! + +{{< /blocks/products/pf/tutorial-page-section >}} +{{< /blocks/products/pf/main-container >}} +{{< /blocks/products/pf/main-wrap-class >}} +{{< blocks/products/products-backtop-button >}} \ No newline at end of file diff --git a/ocr/swedish/python/general/extract-text-from-image-ocr-with-aspose-ai-spell-check/_index.md b/ocr/swedish/python/general/extract-text-from-image-ocr-with-aspose-ai-spell-check/_index.md new file mode 100644 index 000000000..7f9341621 --- /dev/null +++ b/ocr/swedish/python/general/extract-text-from-image-ocr-with-aspose-ai-spell-check/_index.md @@ -0,0 +1,230 @@ +--- +category: general +date: 2026-05-03 +description: extrahera text från bild med Aspose OCR och AI‑stavningskontroll. Lär + dig hur du OCR:ar en bild, laddar bild för OCR, känner igen text från faktura och + frigör GPU‑resurser. +draft: false +keywords: +- extract text from image +- how to ocr image +- load image for ocr +- release gpu resources +- recognize text from invoice +language: sv +og_description: extrahera text från bild med Aspose OCR och AI‑stavningskontroll. + Steg‑för‑steg‑guide som täcker hur man OCR‑ar en bild, laddar bilden för OCR och + frigör GPU‑resurser. 
+og_title: extrahera text från bild – Fullständig OCR‑ och stavningskontrollguide +tags: +- OCR +- Aspose +- AI +- Python +title: extrahera text från bild – OCR med Aspose AI stavningskontroll +url: /sv/python/general/extract-text-from-image-ocr-with-aspose-ai-spell-check/ +--- + +{{< blocks/products/pf/main-wrap-class >}} +{{< blocks/products/pf/main-container >}} +{{< blocks/products/pf/tutorial-page-section >}} + +# extrahera text från bild – Komplett OCR & Spell‑Check Guide + +Har du någonsin behövt **extrahera text från bild** men varit osäker på vilket bibliotek som ger både hastighet och noggrannhet? Du är inte ensam. I många verkliga projekt—tänk fakturabehandling, kvitto‑digitalisering eller skanning av kontrakt—är det första hindret att få ren, sökbar text från en bild. + +Den goda nyheten är att Aspose OCR i kombination med en lättviktig Aspose AI‑modell kan hantera jobbet på några rader Python. I den här handledningen går vi igenom **how to OCR image**, laddar bilden korrekt, kör en inbyggd spell‑check‑post‑processor och slutligen **release GPU resources** så att din app förblir minnesvänlig. + +I slutet av den här guiden kommer du att kunna **recognize text from invoice**‑bilder, automatiskt korrigera vanliga OCR‑fel och hålla ditt GPU rent för nästa batch. + +--- + +## Vad du behöver + +- Python 3.9 eller nyare (koden använder typ‑hints men fungerar på tidigare 3.x‑versioner) +- `aspose-ocr` och `aspose-ai` paket (installera via `pip install aspose-ocr aspose-ai`) +- En CUDA‑aktiverad GPU är valfri; skriptet faller tillbaka till CPU om ingen hittas. +- En exempelbild, t.ex. `sample_invoice.png`, placerad i en mapp du kan referera till. + +Inga tunga ML‑ramverk, inga massiva modellnedladdningar—bara en liten Q4‑K‑M‑kvantiserad modell som passar bekvämt på de flesta GPU:er. + +--- + +## Steg 1: Initiera OCR‑motorn – extrahera text från bild + +Det första du gör är att skapa en `OcrEngine`‑instans och ange vilket språk du förväntar dig. 
Här väljer vi engelska och begär plain‑text‑utdata, vilket är idealiskt för efterföljande bearbetning. + +```python +import aocr # Aspose OCR package +import aspose.ai as ai # Aspose AI package + +# Initialise the OCR engine +ocr_engine = aocr.OcrEngine() +ocr_engine.language = aocr.Language.English # Choose any supported language +ocr_engine.recognize_mode = aocr.RecognitionMode.Plain # Plain text makes post‑processing easier +``` + +**Varför detta är viktigt:** Att ange språket begränsar teckenuppsättningen, vilket förbättrar noggrannheten. Plain‑text‑läget tar bort layoutinformation som du vanligtvis inte behöver när du bara vill extrahera text från bild. + +--- + +## Steg 2: Ladda bild för OCR – how to OCR image + +Nu matar vi motorn med en faktisk bild. Hjälpmetoden `Image.load` förstår vanliga format (PNG, JPEG, TIFF) och abstraherar bort fil‑IO‑egenskaper. + +```python +# Load the input image – this is the "load image for OCR" step +input_image = aocr.Image.load("YOUR_DIRECTORY/sample_invoice.png") +raw_text = ocr_engine.recognize(input_image) # Returns the recognised text as a string +``` + +**Tips:** Om dina källbilder är stora, överväg att ändra storlek på dem innan du skickar dem till motorn; mindre dimensioner kan minska GPU‑minnesanvändning utan att försämra igenkänningskvaliteten. + +--- + +## Steg 3: Konfigurera Aspose AI‑modellen – recognize text from invoice + +Aspose AI levereras med en liten GGUF‑modell som du kan auto‑ladda ner. Exemplet använder `Qwen2.5‑3B‑Instruct‑GGUF`‑arkivet, kvantiserat till `q4_k_m`. Vi instruerar också runtime att allokera 20 lager på GPU:n, vilket balanserar hastighet och VRAM‑användning. 
+ +```python +# Model configuration – auto‑download a small Q4‑K‑M quantised model +model_config = ai.AsposeAIModelConfig() +model_config.allow_auto_download = "true" +model_config.hugging_face_repo_id = "bartowski/Qwen2.5-3B-Instruct-GGUF" +model_config.hugging_face_quantization = "q4_k_m" +model_config.gpu_layers = 20 # Use 20 GPU layers when a GPU is available +``` + +**Bakom kulisserna:** Den kvantiserade modellen är ungefär 1,5 GB på disk, en bråkdel av en full‑precision modell, men den fångar ändå tillräckligt med språklig nyans för att flagga typiska OCR‑stavefel. + +--- + +## Steg 4: Initiera AsposeAI och anslut spell‑check‑post‑processorn + +Aspose AI inkluderar en färdig spell‑check‑post‑processor. Genom att ansluta den kommer varje OCR‑resultat att rengöras automatiskt. + +```python +# Initialise AsposeAI and attach the built‑in spell‑check post‑processor +ocr_ai = ai.AsposeAI(model_config) # Pass the config we just built +ocr_ai.set_post_processor(ocr_ai.postprocessor_spell_check, {}) # Empty dict → default settings +``` + +**Varför använda post‑processorn?** OCR‑motorer läser ofta fel på “Invoice” som “Invo1ce” eller “Total” som “T0tal”. Spell‑check kör en lättviktig språkmodell över den råa strängen och korrigerar dessa fel utan att du skriver ett eget lexikon. + +--- + +## Steg 5: Kör spell‑check‑post‑processorn på OCR‑resultatet + +När allt är kopplat får du den korrigerade texten med ett enda anrop. Vi skriver också ut både original- och den rensade versionen så att du kan se förbättringen. + +```python +# Run the spell‑check post‑processor on the OCR result +corrected_text = ocr_ai.run_postprocessor(raw_text) + +print("Original :", raw_text) +print("Corrected:", corrected_text) +``` + +Typisk utdata för en faktura kan se ut så här: + +``` +Original : Invo1ce #12345 +Date: 2023/07/15 +Total: $1,250.00 +... +Corrected: Invoice #12345 +Date: 2023/07/15 +Total: $1,250.00 +... 
+``` + +Observera hur “Invo1ce” förvandlades till det korrekta ordet “Invoice”. Det är kraften i den inbyggda AI‑spell‑checken. + +--- + +## Steg 6: Frigör GPU‑resurser – release gpu resources safely + +Om du kör detta i en långlivad tjänst (t.ex. ett webb‑API som bearbetar dussintals fakturor per minut) måste du frigöra GPU‑kontexten efter varje batch. Annars får du minnesläckor och så småningom “CUDA out of memory”-fel. + +```python +# Release GPU resources – crucial to avoid memory leaks +ocr_ai.free_resources() +``` + +**Pro‑tips:** Anropa `free_resources()` inuti ett `finally`‑block eller en context manager så att det alltid körs, även om ett undantag inträffar. + +--- + +## Fullt fungerande exempel + +Genom att sätta ihop alla delar får du ett självständigt skript som du kan släppa in i vilket projekt som helst. + +```python +# extract_text_from_image.py +import aocr +import aspose.ai as ai + +def main(): + # Step 1: Initialise OCR engine + ocr_engine = aocr.OcrEngine() + ocr_engine.language = aocr.Language.English + ocr_engine.recognize_mode = aocr.RecognitionMode.Plain + + # Step 2: Load image for OCR + input_image = aocr.Image.load("YOUR_DIRECTORY/sample_invoice.png") + raw_text = ocr_engine.recognize(input_image) + + # Step 3: Configure Aspose AI model + model_cfg = ai.AsposeAIModelConfig() + model_cfg.allow_auto_download = "true" + model_cfg.hugging_face_repo_id = "bartowski/Qwen2.5-3B-Instruct-GGUF" + model_cfg.hugging_face_quantization = "q4_k_m" + model_cfg.gpu_layers = 20 + + # Step 4: Initialise AI and attach spell‑check + ocr_ai = ai.AsposeAI(model_cfg) + ocr_ai.set_post_processor(ocr_ai.postprocessor_spell_check, {}) + + # Step 5: Run spell‑check + corrected_text = ocr_ai.run_postprocessor(raw_text) + + print("Original :", raw_text) + print("Corrected:", corrected_text) + + # Step 6: Release GPU resources + ocr_ai.free_resources() + +if __name__ == "__main__": + main() +``` + +Spara filen, justera sökvägen till din bild och kör `python 
extract_text_from_image.py`. Du bör se den rensade fakturatexten skriven till konsolen. + +--- + +## Vanliga frågor (FAQ) + +**Q: Fungerar detta på enbart CPU‑maskiner?** +A: Absolut. Om ingen GPU upptäcks faller Aspose AI tillbaka till CPU‑exekvering, men det blir långsammare. Du kan tvinga CPU genom att sätta `model_cfg.gpu_layers = 0`. + +**Q: Vad händer om mina fakturor är på ett annat språk än engelska?** +A: Ändra `ocr_engine.language` till rätt enum‑värde (t.ex. `aocr.Language.Spanish`). Spell‑check‑modellen är flerspråkig, men du kan få bättre resultat med en språk‑specifik modell. + +**Q: Kan jag bearbeta flera bilder i en loop?** +A: Ja. Flytta bara laddnings‑, igenkännings‑ och post‑processstegen in i en `for`‑loop. Kom ihåg att anropa `ocr_ai.free_resources()` efter loopen eller efter varje batch om du återanvänder samma AI‑instans. + +**Q: Hur stor är modellnedladdningen?** +A: Ungefär 1,5 GB för den kvantiserade `q4_k_m`‑versionen. Den cachas efter första körningen, så efterföljande exekveringar är omedelbara. + +--- + +## Slutsats + +I den här handledningen demonstrerade vi hur man **extract text from image** med Aspose OCR, konfigurerar en liten AI‑modell, tillämpar en spell‑check‑post‑processor och säkert **release GPU resources**. Arbetsflödet täcker allt från att ladda bilden till att städa upp efter dig, vilket ger dig en pålitlig pipeline för **recognize text from invoice**‑scenarier. + +Nästa steg? 
Prova att byta ut spell‑checken mot en anpassad entity‑extraction‑modell + +{{< /blocks/products/pf/tutorial-page-section >}} +{{< /blocks/products/pf/main-container >}} +{{< /blocks/products/pf/main-wrap-class >}} +{{< blocks/products/products-backtop-button >}} \ No newline at end of file diff --git a/ocr/swedish/python/general/how-to-batch-ocr-with-aspose-ocr-full-python-guide/_index.md b/ocr/swedish/python/general/how-to-batch-ocr-with-aspose-ocr-full-python-guide/_index.md new file mode 100644 index 000000000..0d62d6fee --- /dev/null +++ b/ocr/swedish/python/general/how-to-batch-ocr-with-aspose-ocr-full-python-guide/_index.md @@ -0,0 +1,215 @@ +--- +category: general +date: 2026-05-03 +description: Så här batchar du OCR på bilder med Aspose OCR och AI‑stavningskontroll. + Lär dig att extrahera text från bilder, tillämpa stavningskontroll, använda gratis + AI‑resurser och korrigera OCR‑fel. +draft: false +keywords: +- how to batch ocr +- extract text from images +- free ai resources +- apply spell check +- correct ocr errors +language: sv +og_description: Hur man batchar OCR‑bilder med Aspose OCR och AI‑stavningskontroll. + Följ en steg‑för‑steg‑guide för att extrahera text från bilder, tillämpa stavningskontroll, + använda gratis AI‑resurser och korrigera OCR‑fel. +og_title: Hur man batchar OCR med Aspose OCR – Komplett Python‑guide +tags: +- OCR +- Python +- AI +- Aspose +title: Hur man batchar OCR med Aspose OCR – Fullständig Python‑guide +url: /sv/python/general/how-to-batch-ocr-with-aspose-ocr-full-python-guide/ +--- + +{{< blocks/products/pf/main-wrap-class >}} +{{< blocks/products/pf/main-container >}} +{{< blocks/products/pf/tutorial-page-section >}} + +# Hur man batch‑OCR med Aspose OCR – Fullständig Python‑guide + +Har du någonsin undrat **hur man batch‑OCR** en hel mapp med skannade PDF‑filer eller foton utan att skriva ett separat skript för varje fil? Du är inte ensam. 
I många verkliga pipelines behöver du **extrahera text från bilder**, rensa stavfel och slutligen frigöra eventuella AI‑resurser du har allokerat. Den här handledningen visar exakt hur du gör det med Aspose OCR, en lättviktig AI‑post‑processor, och några rader Python. + +Vi går igenom hur du initierar OCR‑motorn, kopplar in en AI‑stavningskontroll, loopar över en katalog med bilder och rensar upp modellen efteråt. I slutet har du ett färdigt skript som **korrigerar OCR‑fel** automatiskt och frigör **AI‑resurser** så att ditt GPU förblir nöjt. + +## Vad du behöver + +- Python 3.9+ (koden använder type‑hints men fungerar på tidigare 3.x‑versioner) +- `asposeocr`‑paketet (`pip install asposeocr`) – detta tillhandahåller OCR‑motorn. +- Tillgång till Hugging Face‑modellen `bartowski/Qwen2.5-3B-Instruct-GGUF` (hämtas automatiskt). +- Ett GPU med minst några GB VRAM (skriptet sätter `gpu_layers = 30`, du kan sänka det om det behövs). + +Ingen extern tjänst, inga betalda API‑er – allt körs lokalt. + +--- + +## Steg 1: Ställ in OCR‑motorn – **Hur man batch‑OCR** effektivt + +Innan vi kan bearbeta tusen bilder behöver vi en stabil OCR‑motor. Aspose OCR låter oss välja språk och igenkänningsläge i ett enda anrop. + +```python +# Step 1: Initialize the OCR engine for English plain‑text output +def init_ocr() -> aocr.OcrEngine: + ocr_engine = aocr.OcrEngine() + ocr_engine.language = aocr.Language.English # English language pack + ocr_engine.recognize_mode = aocr.RecognitionMode.Plain # Returns raw string, no layout + return ocr_engine +``` + +**Varför detta är viktigt:** Att sätta `recognize_mode` till `Plain` håller utdata lätta, vilket är idealiskt när du planerar att köra en stavningskontroll senare. Om du behövde layoutinformation skulle du byta till `Layout`, men det lägger till overhead du förmodligen inte vill ha i ett batch‑jobb. 
+ +> **Pro tip:** Om du hanterar flerspråkiga skanningar kan du skicka en lista som `ocr_engine.language = [aocr.Language.English, aocr.Language.Spanish]`. + +--- + +## Steg 2: Initiera AI‑post‑processorn – **Applicera stavningskontroll** på OCR‑utdata + +Aspose AI levereras med en inbyggd post‑processor som kan köra vilken modell du vill. Här hämtar vi en kvantiserad Qwen 2.5‑modell från Hugging Face och kopplar in stavningskontrollen. + +```python +# Step 2: Configure and start the AI post‑processor +def init_ai() -> aocr.ai.AsposeAI: + model_cfg = AsposeAIModelConfig() + model_cfg.allow_auto_download = "true" + model_cfg.hugging_face_repo_id = "bartowski/Qwen2.5-3B-Instruct-GGUF" + model_cfg.hugging_face_quantization = "q4_k_m" + model_cfg.gpu_layers = 30 # Adjust based on your GPU memory + ai_processor = AsposeAI() + ai_processor.initialize(model_cfg) + + # Attach the built‑in spell‑check post‑processor + ai_processor.set_post_processor(ai_processor.postprocessor_spell_check, {}) + return ai_processor +``` + +**Varför detta är viktigt:** Modellen är kvantiserad (`q4_k_m`), vilket kraftigt minskar minnesanvändningen samtidigt som den levererar rimlig språkförståelse. Genom att anropa `set_post_processor` säger vi åt Aspose AI att automatiskt köra **apply spell check**‑steget på varje sträng vi matar in. + +> **Observera:** Om ditt GPU inte klarar 30 lager, sänk antalet till 15 eller till och med 5 – skriptet fungerar fortfarande, bara lite långsammare. + +--- + +## Steg 3: Kör OCR och **korrigera OCR‑fel** på en enskild bild + +Nu när både OCR‑motorn och AI‑stavningskontrollen är redo, kombinerar vi dem. Denna funktion laddar en bild, extraherar råtext och kör sedan AI‑post‑processorn för att rensa upp den. 
+ +```python +# Step 3: OCR an image and run the spell‑check post‑processor +def ocr_and_correct(image_path: str, + ocr_engine: aocr.OcrEngine, + ai_processor: aocr.ai.AsposeAI) -> str: + image = aocr.Image.load(image_path) # Load any supported format + raw_text = ocr_engine.recognize(image) # Plain string from OCR + corrected_text = ai_processor.run_postprocessor(raw_text) + return corrected_text +``` + +**Varför detta är viktigt:** Genom att direkt föra in den råa OCR‑strängen i AI‑modellen får vi ett **correct OCR errors**‑pass utan att skriva några regex‑mönster eller egna ordböcker. Modellen förstår kontext, så den kan fixa “recieve” → “receive” och ännu mer subtila misstag. + +--- + +## Steg 4: **Extrahera text från bilder** i bulk – den verkliga batch‑loopen + +Här kommer magin med **hur man batch‑OCR** till sin rätt. Vi itererar över en katalog, hoppar över filer som inte stöds och skriver varje korrigerat resultat till en `.txt`‑fil. + +```python +# Step 4: Process an entire folder of images +if __name__ == "__main__": + # Initialize once – reuse for every file + ocr_engine = init_ocr() + ai_processor = init_ai() + + input_dir = "YOUR_DIRECTORY/input_images" + output_dir = "YOUR_DIRECTORY/output_text" + os.makedirs(output_dir, exist_ok=True) + + for file_name in os.listdir(input_dir): + # Only handle common image extensions + if not file_name.lower().endswith(('.png', '.jpg', '.jpeg', '.tif', '.tiff')): + continue + + image_path = os.path.join(input_dir, file_name) + corrected = ocr_and_correct(image_path, ocr_engine, ai_processor) + + txt_path = os.path.join(output_dir, + os.path.splitext(file_name)[0] + ".txt") + with open(txt_path, "w", encoding="utf-8") as txt_file: + txt_file.write(corrected) + + print(f"Processed {file_name}") + + # Step 5: Release **free AI resources** after the batch finishes + ai_processor.free_resources() +``` + +### Förväntad output + +För en bild som innehåller meningen *“The quick brown fox jumps over the lazzy dog.”* får du 
en textfil med: + +``` +The quick brown fox jumps over the lazy dog. +``` + +Observera att den dubbla “z”:en korrigerades automatiskt – det är AI‑stavningskontrollen i aktion. + +**Varför detta är viktigt:** Genom att skapa OCR‑ och AI‑objekten **en gång** och återanvända dem undviker vi overheaden av att ladda modellen för varje fil. Detta är det mest effektiva sättet att **hur man batch‑OCR** i skala. + +--- + +## Steg 5: Rensa upp – **Frigör AI‑resurser** korrekt + +När du är klar, frigör anropet `free_resources()` GPU‑minne, CUDA‑kontexter och eventuella temporära filer som modellen skapat. + +```python +# Step 5: Explicitly free GPU and model memory +ai_processor.free_resources() +``` + +Att hoppa över detta steg kan lämna hängande GPU‑allokeringar, vilket kan krascha efterföljande Python‑processer eller äta upp VRAM. Tänk på det som “släck lamporna” i ett batch‑jobb. + +--- + +## Vanliga fallgropar & extra tips + +| Problem | Vad du ska leta efter | Lösning | +|-------|------------------|-----| +| **Out‑of‑memory‑fel** | GPU tar slut efter några dussin bilder | Minska `gpu_layers` eller byt till CPU (`model_cfg.gpu_layers = 0`). | +| **Saknat språkpaket** | OCR returnerar tomma strängar | Säkerställ att `asposeocr`‑versionen innehåller engelska språkdata; installera om vid behov. | +| **Icke‑bildfiler** | Skriptet kraschar på en stray `.pdf` | Guard‑satsen `if not file_name.lower().endswith(...)` hoppar redan över dem. | +| **Stavningskontroll ej tillämpad** | Output ser identisk ut med rå OCR | Verifiera att `ai_processor.set_post_processor` anropades innan loopen. | +| **Långsam batch‑hastighet** | Tar >5 sekunder per bild | Aktivera `model_cfg.allow_auto_download = "false"` efter första körningen, så modellen inte laddas ner igen varje gång. | + +**Pro tip:** Om du behöver **extrahera text från bilder** på ett annat språk än engelska, ändra helt enkelt `ocr_engine.language` till rätt enum (t.ex. `aocr.Language.French`). 
Samma AI‑post‑processor kommer fortfarande att applicera stavningskontroll, men du kanske vill ha en språk‑specifik modell för bästa resultat. + +--- + +## Sammanfattning & nästa steg + +Vi har gått igenom hela pipeline‑processen för **hur man batch‑OCR**: + +1. **Initiera** en plain‑text OCR‑motor för engelska. +2. **Konfigurera** en AI‑stavningskontrollmodell och bind den som post‑processor. +3. **Kör** OCR på varje bild och låt AI **korrigera OCR‑fel** automatiskt. +4. **Loop** över en katalog för att **extrahera text från bilder** i bulk. +5. **Frigör AI‑resurser** när jobbet är klart. + +Från här kan du: + +- Skicka den korrigerade texten vidare till en downstream NLP‑pipeline (sentiment‑analys, entity‑extraction, osv.). +- Byta ut stavningskontroll‑post‑processorn mot en egen summerare genom att anropa `ai_processor.set_post_processor(your_custom_func, {})`. +- Parallelisera katalogloopen med `concurrent.futures.ThreadPoolExecutor` om ditt GPU klarar flera strömmar. + +--- + +## Slutord + +Batch‑OCR behöver inte vara ett krångligt arbete. Genom att utnyttja Aspose OCR tillsammans med en lättviktig AI‑modell får du en **one‑stop‑solution** som **extraherar text från bilder**, **tillämpar stavningskontroll**, **korrigerar OCR‑fel**, och **frigör AI‑resurser** på ett rent sätt. Prova skriptet på en testmapp, justera GPU‑lagersantalet så det passar din hårdvara, så har du en produktionsklar pipeline på några minuter. + +Har du frågor om att finjustera modellen, hantera PDF‑filer eller integrera detta i en webbtjänst? Lämna en kommentar nedan eller pinga mig på GitHub. Lycklig kodning, och må din OCR alltid vara träffsäker! 
+ +{{< /blocks/products/pf/tutorial-page-section >}} +{{< /blocks/products/pf/main-container >}} +{{< /blocks/products/pf/main-wrap-class >}} +{{< blocks/products/products-backtop-button >}} \ No newline at end of file diff --git a/ocr/swedish/python/general/python-ocr-tutorial-batch-ocr-processing-made-easy/_index.md b/ocr/swedish/python/general/python-ocr-tutorial-batch-ocr-processing-made-easy/_index.md new file mode 100644 index 000000000..1b7afd21b --- /dev/null +++ b/ocr/swedish/python/general/python-ocr-tutorial-batch-ocr-processing-made-easy/_index.md @@ -0,0 +1,298 @@ +--- +category: general +date: 2026-05-03 +description: Python OCR-handledning som visar hur man laddar PNG‑bildfiler, känner + igen text från bilden och gratis AI‑resurser för batch‑OCR‑behandling. +draft: false +keywords: +- python ocr tutorial +- batch ocr processing +- free ai resources +- load png image +- recognize text from image +language: sv +og_description: Python OCR-handledning visar dig hur du laddar PNG-bilder, känner + igen text i bilden och hanterar gratis AI-resurser för batch‑OCR‑behandling. +og_title: Python OCR-handledning – Snabb batch-OCR med gratis AI-resurser +tags: +- OCR +- Python +- AI +title: Python OCR-handledning – Batch-OCR-behandling gjort enkelt +url: /sv/python/general/python-ocr-tutorial-batch-ocr-processing-made-easy/ +--- + +{{< blocks/products/pf/main-wrap-class >}} +{{< blocks/products/pf/main-container >}} +{{< blocks/products/pf/tutorial-page-section >}} + +# Python OCR-handledning – Batch-OCR-behandling gjort enkelt + +Har du någonsin behövt en **python ocr tutorial** som faktiskt låter dig köra OCR på dussintals PNG‑filer utan att dra i håret? Du är inte ensam. I många verkliga projekt måste du **load png image**‑filer, mata dem till en motor och sedan rensa upp AI‑resurserna när du är klar. 
+ +I den här guiden går vi igenom ett komplett, färdigt‑att‑köra exempel som visar exakt hur man **recognize text from image**‑filer, bearbetar dem i batch och frigör det underliggande AI‑minnet. I slutet har du ett självständigt skript som du kan släppa in i vilket projekt som helst—utan extra krusiduller, bara det nödvändigaste. + +## Vad du behöver + +- Python 3.10 eller nyare (syntaxen som används här förlitar sig på f‑strings och typ‑hintar) +- Ett OCR‑bibliotek som exponerar en `engine.recognize`‑metod – för demonstrationsändamål antar vi ett fiktivt `aocr`‑paket, men du kan byta ut det mot Tesseract, EasyOCR osv. +- `ai`‑hjälpmodulen som visas i kodsnutten (den hanterar modellinitiering och resurssanering) +- En mapp full av PNG‑filer som du vill bearbeta + +Om du inte har `aocr` eller `ai` installerat kan du efterlikna dem med stubbar – se avsnittet “Optional Stubs” längre ner. + +## Steg 1: Initiera AI‑motorn (Frigör AI‑resurser) + +Innan du matar in någon bild i OCR‑pipelines måste den underliggande modellen vara redo. Att initiera bara en gång sparar minne och snabbar upp batch‑jobb. + +```python +# step_1_initialize.py +import ai # hypothetical helper that wraps the AI model +import aocr # OCR library + +def init_engine(config_path: str = "config.yaml"): + """ + Initialize the AI engine if it hasn't been set up yet. + This uses free AI resources – the engine will be released later. + """ + if not ai.is_initialized(): + ai.initialize(config_path) # auto‑initialize with the provided configuration + else: + print("Engine already initialized.") +``` + +**Varför detta är viktigt:** +Att anropa `ai.initialize` upprepade gånger för varje bild skulle allokera GPU‑minne om och om igen, vilket så småningom får skriptet att krascha. Genom att kontrollera `ai.is_initialized()` garanterar vi en enda allokering – det är principen “frigör AI‑resurser”. 
+ +## Steg 2: Ladda PNG‑bildfiler för batch‑OCR‑bearbetning + +Nu samlar vi alla PNG‑filer som vi vill köra genom OCR. Att använda `pathlib` gör koden OS‑agnostisk. + +```python +# step_2_load_images.py +from pathlib import Path +from typing import List + +def collect_png_paths(directory: str) -> List[Path]: + """ + Scan `directory` and return a list of Path objects pointing to PNG files. + """ + base_path = Path(directory) + if not base_path.is_dir(): + raise NotADirectoryError(f"'{directory}' is not a valid folder.") + + png_files = sorted(base_path.glob("*.png")) + if not png_files: + raise FileNotFoundError("No PNG images found in the specified directory.") + + print(f"Found {len(png_files)} PNG image(s) to process.") + return png_files +``` + +**Edge case:** +Om mappen innehåller icke‑PNG‑filer (t.ex. JPEG) kommer de att ignoreras, vilket förhindrar att `engine.recognize` fastnar på ett format som inte stöds. + +## Steg 3: Kör OCR på varje bild och tillämpa efterbehandling + +Med motorn redo och fillistan förberedd kan vi loopa över bilderna, extrahera råtext och skicka den till en efterprocessor som rensar vanliga OCR‑artefakter (som oönskade radbrytningar). + +```python +# step_3_ocr_batch.py +import aocr +import ai +from pathlib import Path +from typing import List + +def ocr_batch(image_paths: List[Path]) -> List[str]: + """ + Perform OCR on each PNG image and return a list of cleaned strings. 
+ """ + results = [] + for image_path in image_paths: + # Load the image – aocr.Image.load abstracts away Pillow/OpenCV details + img = aocr.Image.load(str(image_path)) + + # Recognize raw text + raw_text = engine.recognize(img) + + # Refine the raw OCR output using the AI post‑processor + cleaned_text = ai.run_postprocessor(raw_text) + results.append(cleaned_text) + + print(f"Processed {image_path.name}: {len(cleaned_text)} characters extracted.") + + return results +``` + +**Varför vi separerar laddning från igenkänning:** +`aocr.Image.load` kan utföra lat avkodning, vilket är snabbare för stora batcher. Att hålla laddningssteget explicit gör det också enkelt att byta till ett annat bildbibliotek om du senare behöver hantera JPEG‑ eller TIFF‑filer. + +## Steg 4: Rensa upp – Frigör AI‑resurser efter batchen + +När batchen är klar måste vi släppa modellen för att undvika minnesläckor, särskilt på maskiner med GPU‑stöd. + +```python +# step_4_cleanup.py +import ai + +def release_resources(): + """ + Free any allocated AI resources. Safe to call multiple times. + """ + if ai.is_initialized(): + ai.free_resources() + print("AI resources have been released.") + else: + print("No AI resources were allocated.") +``` + +## Sätt ihop allt – Det kompletta skriptet + +Nedan är en enda fil som sammanfogar de fyra stegen till ett sammanhängande arbetsflöde. Spara den som `batch_ocr.py` och kör den från kommandoraden. + +```python +# batch_ocr.py +""" +Python OCR tutorial – end‑to‑end batch OCR processing. +Loads PNG images, runs OCR, post‑processes results, and frees AI resources. 
+""" + +import sys +from pathlib import Path +import ai +import aocr + +# ---------------------------------------------------------------------- +# Helper functions (copied from the steps above) +# ---------------------------------------------------------------------- +def init_engine(cfg: str = "config.yaml"): + if not ai.is_initialized(): + ai.initialize(cfg) + else: + print("Engine already initialized.") + +def collect_png_paths(directory: str): + base_path = Path(directory) + if not base_path.is_dir(): + raise NotADirectoryError(f"'{directory}' is not a valid folder.") + png_files = sorted(base_path.glob("*.png")) + if not png_files: + raise FileNotFoundError("No PNG images found in the specified directory.") + print(f"Found {len(png_files)} PNG image(s) to process.") + return png_files + +def ocr_batch(image_paths): + results = [] + for image_path in image_paths: + img = aocr.Image.load(str(image_path)) + raw_text = engine.recognize(img) + cleaned_text = ai.run_postprocessor(raw_text) + results.append(cleaned_text) + print(f"Processed {image_path.name}: {len(cleaned_text)} characters.") + return results + +def release_resources(): + if ai.is_initialized(): + ai.free_resources() + print("AI resources released.") + else: + print("No resources to release.") + +# ---------------------------------------------------------------------- +# Main execution block +# ---------------------------------------------------------------------- +if __name__ == "__main__": + if len(sys.argv) != 2: + print("Usage: python batch_ocr.py ") + sys.exit(1) + + image_dir = sys.argv[1] + + try: + init_engine() + png_paths = collect_png_paths(image_dir) + texts = ocr_batch(png_paths) + + # Optional: write results to a single text file + output_file = Path("ocr_results.txt") + with output_file.open("w", encoding="utf-8") as f: + for path, txt in zip(png_paths, texts): + f.write(f"--- {path.name} ---\n") + f.write(txt + "\n\n") + print(f"All results saved to {output_file.resolve()}") + 
finally: + release_resources() +``` + +### Förväntad output + +Att köra skriptet mot en mapp som innehåller tre PNG‑filer kan skriva ut: + +``` +Engine already initialized. +Found 3 PNG image(s) to process. +Processed invoice1.png: 452 characters. +Processed receipt2.png: 317 characters. +Processed flyer3.png: 689 characters. +All results saved to /home/user/ocr_results.txt +AI resources released. +``` + +`ocr_results.txt`‑filen kommer att innehålla en tydlig avgränsare för varje bild följt av den rensade OCR‑texten. + +## Valfria stubbar för aocr & ai (Om du inte har riktiga paket) + +Om du bara vill testa flödet utan att dra in tunga OCR‑bibliotek kan du skapa minimala mock‑moduler: + +```python +# aocr/__init__.py +class Image: + @staticmethod + def load(path): + return f"ImageObject({path})" + +def dummy_recognize(image): + return "Raw OCR output for " + str(image) + +# staticmethod: annars binds anropet och 'image' fylls av self +engine = type("Engine", (), {"recognize": staticmethod(dummy_recognize)})() +``` + +```python +# ai/__init__.py +_state = {"initialized": False} + +def is_initialized(): + return _state["initialized"] + +def initialize(cfg): + print(f"Initializing AI engine with {cfg}") + _state["initialized"] = True + +def run_postprocessor(text): + # Very naive cleanup: strip extra spaces + return " ".join(text.split()) + +def free_resources(): + print("Freeing AI resources") + _state["initialized"] = False +``` + +Placera dessa mappar bredvid `batch_ocr.py` så kör skriptet och skriver ut mock‑resultat. + +## Pro‑tips & vanliga fallgropar + +- **Minnesökningar:** Om du bearbetar tusentals högupplösta PNG‑filer, överväg att ändra storlek på dem innan OCR. `aocr.Image.load` accepterar ofta ett `max_size`‑argument. +- **Unicode‑hantering:** Öppna alltid utdatafilen med `encoding="utf-8"`; OCR‑motorer kan generera icke‑ASCII‑tecken. +- **Parallellism:** För CPU‑bunden OCR kan du omsluta `ocr_batch` i en `concurrent.futures.ThreadPoolExecutor`. 
Kom bara ihåg att behålla en enda `ai`‑instans – att skapa många trådar som var och en anropar `ai.initialize` motverkar målet “frigör AI‑resurser”. +- **Feltolerans:** Omslut per‑bild‑loopen i ett `try/except`‑block så att en enda korrupt PNG inte avbryter hela batchen. + +## Slutsats + +Du har nu en **python ocr tutorial** som demonstrerar hur man **load png image**‑filer, utför **batch OCR processing**, och ansvarsfullt hanterar **free AI resources**. Det kompletta, körbara exemplet visar exakt hur man **recognize text from image**‑objekt och rensar upp efteråt, så att du kan kopiera‑klistra in det i dina egna projekt utan att leta efter saknade delar. + +Redo för nästa steg? Prova att byta ut de stub‑ade `aocr`‑ och `ai`‑modulerna mot riktiga bibliotek som `pytesseract` och `torchvision`. Du kan också utöka skriptet för att skriva ut JSON, skicka resultat till en databas eller integrera med en molnlagrings‑bucket. Endast fantasin sätter gränserna – lycka till med kodningen! + +{{< /blocks/products/pf/tutorial-page-section >}} +{{< /blocks/products/pf/main-container >}} +{{< /blocks/products/pf/main-wrap-class >}} +{{< blocks/products/products-backtop-button >}} \ No newline at end of file diff --git a/ocr/swedish/python/general/run-ocr-on-image-complete-guide-to-structured-text-extractio/_index.md b/ocr/swedish/python/general/run-ocr-on-image-complete-guide-to-structured-text-extractio/_index.md new file mode 100644 index 000000000..111b869f4 --- /dev/null +++ b/ocr/swedish/python/general/run-ocr-on-image-complete-guide-to-structured-text-extractio/_index.md @@ -0,0 +1,254 @@ +--- +category: general +date: 2026-05-03 +description: Lär dig hur du kör OCR på en bild och extraherar text med koordinater + med strukturerad OCR‑igenkänning. Steg‑för‑steg Python‑kod inkluderad.
+draft: false +keywords: +- run OCR on image +- extract text with coordinates +- structured OCR recognition +- OCR post‑processing +- bounding box extraction +- image text detection +language: sv +og_description: Kör OCR på en bild och få text med koordinater med strukturerad OCR‑igenkänning. + Fullt Python‑exempel med förklaringar. +og_title: Kör OCR på bild – Handledning för strukturerad textutvinning +tags: +- OCR +- Python +- Computer Vision +title: Kör OCR på bild – Komplett guide till strukturerad textutvinning +url: /sv/python/general/run-ocr-on-image-complete-guide-to-structured-text-extractio/ +--- + +{{< blocks/products/pf/main-wrap-class >}} +{{< blocks/products/pf/main-container >}} +{{< blocks/products/pf/tutorial-page-section >}} + +# Run OCR on image – Complete Guide to Structured Text Extraction + +Har du någonsin behövt **run OCR on image**‑filer men varit osäker på hur du behåller de exakta positionerna för varje ord? Du är inte ensam. I många projekt—kvittoskanning, formulärdigitalisering eller UI‑testning—behöver du inte bara råtexten utan även de avgränsningsrutor som visar var varje rad finns på bilden. + +Denna handledning visar dig ett praktiskt sätt att *run OCR on image* med **aocr**‑motorn, begära **structured OCR recognition**, och sedan efterbearbeta resultatet samtidigt som du bevarar geometrin. I slutet kommer du kunna **extract text with coordinates** med bara några rader Python, och du kommer förstå varför strukturerat läge är viktigt för efterföljande uppgifter. + +## What You’ll Learn + +- Hur du initierar OCR‑motorn för **structured OCR recognition**. +- Hur du matar in en bild och får råresultat som inkluderar radgränser. +- Hur du kör en post‑processor som rensar texten utan att förlora geometrin. +- Hur du itererar över de slutgiltiga raderna och skriver ut varje textstycke tillsammans med dess avgränsningsruta. + +Ingen magi, inga dolda steg—bara ett komplett, körbart exempel som du kan släppa in i ditt eget projekt. 
+ +--- + +## Prerequisites + +Innan vi dyker ner, se till att du har följande installerat: + +```bash +pip install aocr ai # hypothetical packages; replace with real ones if needed +``` + +Du behöver också en bildfil (`input_image.png` eller `.jpg`) som innehåller klar, läsbar text. Allt från en skannad faktura till en skärmdump fungerar, så länge OCR‑motorn kan se tecknen. + +--- + +## Step 1: Initialise the OCR engine for structured recognition + +Det första vi gör är att skapa en instans av `aocr.Engine()` och tala om för den att vi vill ha **structured OCR recognition**. Strukturerat läge returnerar inte bara vanlig text utan även geometrisk data (avgränsningsrektanglar) för varje rad, vilket är avgörande när du behöver mappa text tillbaka på bilden. + +```python +import aocr +import ai # hypothetical post‑processing module + +# Initialise the OCR engine +ocr_engine = aocr.Engine() + +# Request structured recognition (text + geometry) +ocr_engine.recognize_mode = aocr.RecognitionMode.Structured +``` + +> **Why this matters:** +> I standardläget kan motorn bara ge dig en sträng med sammanslagna ord. Strukturerat läge ger dig en hierarki av sidor → rader → ord, var och en med koordinater, vilket gör det mycket enklare att överlagra resultat på originalbilden eller mata in dem i en layout‑medveten modell. + +--- + +## Step 2: Run OCR on the image and obtain raw results + +Nu matar vi bilden till motorn. Anropet `recognize` returnerar ett `OcrResult`‑objekt som innehåller en samling rader, var och en med sin egen avgränsningsruta. + +```python +# Load your image (any format supported by aocr) +input_image_path = "input_image.png" + +# Run OCR – this returns an OcrResult with lines and bounds +raw_result = ocr_engine.recognize(input_image_path) +``` + +Vid detta tillfälle innehåller `raw_result.lines` objekt med två viktiga attribut: + +- `text` – den igenkända strängen för den raden. +- `bounds` – en tuple som `(x, y, width, height)` beskriver radens position. 
+ +--- + +## Step 3: Post‑process while preserving geometry + +Rå OCR‑utdata är ofta brusig: lösa tecken, felplacerade mellanslag eller radbrytningsproblem. Funktionen `ai.run_postprocessor` rensar texten men **behåller den ursprungliga geometrin** intakt, så du fortfarande har korrekta koordinater. + +```python +# Apply a post‑processing step that corrects common OCR errors +postprocessed_result = ai.run_postprocessor(raw_result) + +# The structure (lines + bounds) stays the same, only `line.text` changes +``` + +> **Pro tip:** Om du har domänspecifika vokabulärer (t.ex. produktkoder), mata in en anpassad ordlista till post‑processorn för att förbättra noggrannheten. + +--- + +## Step 4: Extract text with coordinates – iterate and display + +Till sist loopar vi över de rensade raderna och skriver ut varje rads avgränsningsruta tillsammans med dess text. Detta är kärnan i **extract text with coordinates**. + +```python +# Print each recognised line together with its bounding box +for line in postprocessed_result.lines: + print(f"[{line.bounds}] {line.text}") +``` + +### Expected Output + +Förutsatt att inmatningsbilden innehåller två rader: “Invoice #12345” och “Total: $89.99”, kommer du se något liknande: + +``` +[(15, 30, 210, 25)] Invoice #12345 +[(15, 70, 190, 25)] Total: $89.99 +``` + +Den första tuplen är `(x, y, width, height)` för raden på originalbilden, vilket låter dig rita rektanglar, markera text eller föra koordinaterna in i ett annat system. + +--- + +## Visualising the Result (Optional) + +Om du vill se avgränsningsrutorna överlagrade på bilden kan du använda Pillow (PIL) för att rita rektanglar. Nedan är ett snabbt kodexempel; hoppa gärna över det om du bara behöver rådata.
+ +```python +from PIL import Image, ImageDraw + +# Open the original image +img = Image.open(input_image_path) +draw = ImageDraw.Draw(img) + +# Draw a rectangle around each line +for line in postprocessed_result.lines: + x, y, w, h = line.bounds + draw.rectangle([x, y, x + w, y + h], outline="red", width=2) + +# Save or show the annotated image +img.save("annotated_output.png") +img.show() +``` + +![run OCR on image example showing bounding boxes](/images/ocr-bounding-boxes.png "run OCR on image – bounding box overlay") + +Alt‑texten ovan innehåller **primary keyword**, vilket uppfyller SEO‑kravet för bild‑alt‑attribut. + +--- + +## Why Structured OCR Recognition Beats Simple Text Extraction + +Du kanske undrar, “Kan jag inte bara köra OCR och få texten? Varför bry sig om geometrin?” + +- **Spatial context:** När du behöver mappa fält på ett formulär (t.ex. “Date” bredvid ett datumvärde) visar koordinater var data finns. +- **Multi‑column layouts:** Enkel linjär text förlorar ordning; strukturerad data bevarar kolumnordning. +- **Post‑processing accuracy:** Att känna till boxens storlek hjälper dig avgöra om ett ord är en rubrik, en fotnot eller ett löst artefakt. + +Kort sagt ger **structured OCR recognition** dig flexibiliteten att bygga smartare pipelines—oavsett om du matar data i en databas, skapar sökbara PDF‑filer eller tränar en maskininlärningsmodell som respekterar layout. + +--- + +## Common Edge Cases and How to Handle Them + +| Situation | What to Watch For | Suggested Fix | +|-----------|-------------------|---------------| +| **Rotated or skewed images** | Bounding boxes may be off‑axis. | Pre‑process with deskewing (e.g., OpenCV’s `warpAffine`). | +| **Very small fonts** | Engine may miss characters, leading to empty lines. | Increase image resolution or use `ocr_engine.set_dpi(300)`. | +| **Mixed languages** | Wrong language model can cause garbled text. | Set `ocr_engine.language = ["en", "de"]` before recognition. 
| +| **Overlapping boxes** | Post‑processor might merge two lines unintentionally. | Verify `line.bounds` after processing; adjust thresholds in `ai.run_postprocessor`. | + +Att hantera dessa scenarier tidigt sparar dig huvudvärk senare, särskilt när du skalar lösningen till hundratals dokument per dag. + +--- + +## Full End‑to‑End Script + +Nedan är det kompletta, färdiga programmet som binder ihop alla steg. Kopiera‑klistra, justera bildvägen, så är du klar. + +```python +# -*- coding: utf-8 -*- +""" +Run OCR on image – extract text with coordinates using structured OCR recognition. +Author: Your Name +Date: 2026-05-03 +""" + +import aocr +import ai +from PIL import Image, ImageDraw + +def run_structured_ocr(image_path: str, annotate: bool = False): + # 1️⃣ Initialise the OCR engine + ocr_engine = aocr.Engine() + ocr_engine.recognize_mode = aocr.RecognitionMode.Structured + + # 2️⃣ Recognise the image + raw_result = ocr_engine.recognize(image_path) + + # 3️⃣ Post‑process while keeping geometry + processed = ai.run_postprocessor(raw_result) + + # 4️⃣ Print each line with its bounding box + for line in processed.lines: + print(f"[{line.bounds}] {line.text}") + + # Optional visualisation + if annotate: + img = Image.open(image_path) + draw = ImageDraw.Draw(img) + for line in processed.lines: + x, y, w, h = line.bounds + draw.rectangle([x, y, x + w, y + h], outline="red", width=2) + annotated_path = "annotated_" + image_path + img.save(annotated_path) + print(f"Annotated image saved as {annotated_path}") + +if __name__ == "__main__": + INPUT_IMG = "input_image.png" + run_structured_ocr(INPUT_IMG, annotate=True) +``` + +Att köra detta skript kommer att: + +1. **Run OCR on image** med strukturerat läge. +2. **Extract text with coordinates** för varje rad. +3. Eventuellt producera en annoterad PNG som visar rutorna. 
+ +--- + +## Conclusion + +Du har nu en solid, självständig lösning för att **run OCR on image** och **extract text with coordinates** med **structured OCR recognition**. Koden demonstrerar varje steg—från initiering av motorn till efterbearbetning och visuell verifiering—så att du kan anpassa den till kvitton, formulär eller vilket visuellt dokument som helst som kräver exakt textlokalisering. + +Vad blir nästa steg? Prova att byta ut `aocr`‑motorn mot ett annat bibliotek (Tesseract, EasyOCR) och se hur deras strukturerade utskrifter skiljer sig. Experimentera med olika efterbearbetningsstrategier, såsom stavningskontroll eller anpassade regex‑filter, för att öka noggrannheten för ditt område. Och om du bygger en större pipeline, överväg att lagra `(text, bounds)`‑par i en databas för senare analys. + +Lycka till med kodningen, och må dina OCR‑projekt alltid vara träffsäkra! + +{{< /blocks/products/pf/tutorial-page-section >}} +{{< /blocks/products/pf/main-container >}} +{{< /blocks/products/pf/main-wrap-class >}} +{{< blocks/products/products-backtop-button >}} \ No newline at end of file diff --git a/ocr/thai/python/general/extract-text-from-image-ocr-with-aspose-ai-spell-check/_index.md b/ocr/thai/python/general/extract-text-from-image-ocr-with-aspose-ai-spell-check/_index.md new file mode 100644 index 000000000..b091005ca --- /dev/null +++ b/ocr/thai/python/general/extract-text-from-image-ocr-with-aspose-ai-spell-check/_index.md @@ -0,0 +1,210 @@ +--- +category: general +date: 2026-05-03 +description: ดึงข้อความจากภาพโดยใช้ Aspose OCR และการตรวจสอบการสะกดด้วย AI เรียนรู้วิธีทำ + OCR กับภาพ โหลดภาพสำหรับ OCR แยกข้อความจากใบแจ้งหนี้และปล่อยทรัพยากร GPU +draft: false +keywords: +- extract text from image +- how to ocr image +- load image for ocr +- release gpu resources +- recognize text from invoice +language: th +og_description: สกัดข้อความจากภาพด้วย Aspose OCR และการตรวจสอบการสะกดด้วย AI คู่มือแบบขั้นตอนต่อขั้นตอนที่ครอบคลุมวิธี + OCR ภาพ, โหลดภาพสำหรับ OCR, และการปล่อยทรัพยากร
GPU. +og_title: ดึงข้อความจากภาพ – คู่มือ OCR และตรวจสอบการสะกดอย่างครบถ้วน +tags: +- OCR +- Aspose +- AI +- Python +title: ดึงข้อความจากภาพ – OCR ด้วย Aspose AI Spell‑Check +url: /th/python/general/extract-text-from-image-ocr-with-aspose-ai-spell-check/ +--- + +{{< blocks/products/pf/main-wrap-class >}} +{{< blocks/products/pf/main-container >}} +{{< blocks/products/pf/tutorial-page-section >}} + +# สกัดข้อความจากรูปภาพ – คู่มือ OCR & การตรวจสอบการสะกดแบบครบถ้วน + +เคยต้องการ **สกัดข้อความจากรูปภาพ** แต่ไม่แน่ใจว่าห้องสมุดใดจะให้ความเร็วและความแม่นยำพร้อมกัน? คุณไม่ได้เป็นคนเดียว ในหลายโครงการจริง—เช่นการประมวลผลใบแจ้งหนี้, การแปลงใบเสร็จเป็นดิจิทัล, หรือการสแกนสัญญา—การได้ข้อความที่สะอาดและค้นหาได้จากภาพเป็นอุปสรรคแรก. + +ข่าวดีคือ Aspose OCR ที่จับคู่กับโมเดล Aspose AI ที่มีน้ำหนักเบาสามารถทำงานนี้ได้ในไม่กี่บรรทัดของ Python ในบทแนะนำนี้เราจะอธิบาย **วิธี OCR รูปภาพ**, โหลดภาพอย่างถูกต้อง, รัน post‑processor ตรวจสอบการสะกดในตัว, และสุดท้าย **ปล่อยทรัพยากร GPU** เพื่อให้แอปของคุณเป็นมิตรกับหน่วยความจำ. + +เมื่อจบคู่มือนี้คุณจะสามารถ **จดจำข้อความจากใบแจ้งหนี้** ในรูปภาพ, แก้ไขข้อผิดพลาด OCR ที่พบบ่อยโดยอัตโนมัติ, และทำให้ GPU ของคุณสะอาดสำหรับชุดต่อไป. + +--- + +## สิ่งที่คุณต้องการ + +- Python 3.9 หรือใหม่กว่า (โค้ดใช้ type hints แต่ทำงานได้กับเวอร์ชัน 3.x ก่อนหน้า) +- แพ็กเกจ `aspose-ocr` และ `aspose-ai` (ติดตั้งโดยใช้ `pip install aspose-ocr aspose-ai`) +- GPU ที่รองรับ CUDA เป็นตัวเลือก; สคริปต์จะใช้ CPU หากไม่พบ GPU +- ภาพตัวอย่าง เช่น `sample_invoice.png` ที่วางไว้ในโฟลเดอร์ที่คุณอ้างอิงได้ + +ไม่มีเฟรมเวิร์ก ML ขนาดใหญ่, ไม่มีการดาวน์โหลดโมเดลขนาดมหาศาล—เพียงโมเดล Q4‑K‑M ที่คอมพิวท์แล้วขนาดเล็กที่พอดีกับ GPU ส่วนใหญ่. + +## ขั้นตอนที่ 1: เริ่มต้น OCR Engine – สกัดข้อความจากรูปภาพ + +สิ่งแรกที่คุณทำคือสร้างอินสแตนซ์ `OcrEngine` และบอกว่าคุณคาดหวังภาษาอะไร ที่นี่เราเลือก English และขอผลลัพธ์เป็น plain‑text ซึ่งเหมาะสำหรับการประมวลผลต่อไป. 
+ +```python +import aocr # Aspose OCR package +import aspose.ai as ai # Aspose AI package + +# Initialise the OCR engine +ocr_engine = aocr.OcrEngine() +ocr_engine.language = aocr.Language.English # Choose any supported language +ocr_engine.recognize_mode = aocr.RecognitionMode.Plain # Plain text makes post‑processing easier +``` + +**ทำไมจึงสำคัญ:** การตั้งค่าภาษา จำกัดชุดอักขระ ทำให้ความแม่นยำดีขึ้น โหมด plain‑text จะลบข้อมูลการจัดรูปแบบที่คุณมักไม่ต้องการเมื่อเพียงต้องการสกัดข้อความจากรูปภาพ. + +## ขั้นตอนที่ 2: โหลดภาพสำหรับ OCR – วิธี OCR รูปภาพ + +ตอนนี้เราจะส่งภาพจริงให้กับ engine ตัวช่วย `Image.load` เข้าใจรูปแบบทั่วไป (PNG, JPEG, TIFF) และจัดการความแปลกประหลาดของ file‑IO ให้คุณ. + +```python +# Load the input image – this is the "load image for OCR" step +input_image = aocr.Image.load("YOUR_DIRECTORY/sample_invoice.png") +raw_text = ocr_engine.recognize(input_image) # Returns the recognised text as a string +``` + +**เคล็ดลับ:** หากภาพต้นทางของคุณมีขนาดใหญ่ ควรปรับขนาดก่อนส่งให้ engine; มิติที่เล็กลงสามารถลดการใช้หน่วยความจำ GPU โดยไม่ทำให้คุณภาพการจดจำลดลง. + +## ขั้นตอนที่ 3: กำหนดค่า Aspose AI Model – จดจำข้อความจากใบแจ้งหนี้ + +Aspose AI มาพร้อมกับโมเดล GGUF ขนาดเล็กที่คุณสามารถดาวน์โหลดอัตโนมัติ ตัวอย่างใช้รีโพซิทอรี `Qwen2.5‑3B‑Instruct‑GGUF` ที่คอมพิวท์เป็น `q4_k_m` เรายังบอก runtime ให้จัดสรร 20 ชั้นบน GPU เพื่อสมดุลความเร็วและการใช้ VRAM. + +```python +# Model configuration – auto‑download a small Q4‑K‑M quantised model +model_config = ai.AsposeAIModelConfig() +model_config.allow_auto_download = "true" +model_config.hugging_face_repo_id = "bartowski/Qwen2.5-3B-Instruct-GGUF" +model_config.hugging_face_quantization = "q4_k_m" +model_config.gpu_layers = 20 # Use 20 GPU layers when a GPU is available +``` + +**เบื้องหลัง:** โมเดลคอมพิวท์มีขนาดประมาณ 1.5 GB บนดิสก์ เป็นส่วนเล็กของโมเดลความแม่นยำเต็มรูปแบบ แต่ยังคงจับความละเอียดทางภาษาที่เพียงพอเพื่อระบุการสะกดผิดทั่วไปของ OCR. 
+ +## ขั้นตอนที่ 4: เริ่มต้น AsposeAI และแนบ post‑processor ตรวจสอบการสะกด + +Aspose AI มี post‑processor ตรวจสอบการสะกดที่พร้อมใช้ โดยการแนบมัน ผลลัพธ์ OCR ทุกอย่างจะถูกทำความสะอาดโดยอัตโนมัติ. + +```python +# Initialise AsposeAI and attach the built‑in spell‑check post‑processor +ocr_ai = ai.AsposeAI(model_config) # Pass the config we just built +ocr_ai.set_post_processor(ocr_ai.postprocessor_spell_check, {}) # Empty dict → default settings +``` + +**ทำไมต้องใช้ post‑processor?** เครื่องมือ OCR มักอ่าน “Invoice” เป็น “Invo1ce” หรือ “Total” เป็น “T0tal” การตรวจสอบการสะกดจะรันโมเดลภาษาน้ำหนักเบาบนสตริงดิบและแก้ไขข้อผิดพลาดเหล่านั้นโดยที่คุณไม่ต้องเขียนพจนานุกรมเอง. + +## ขั้นตอนที่ 5: รัน post‑processor ตรวจสอบการสะกดบนผลลัพธ์ OCR + +เมื่อทุกอย่างเชื่อมต่อแล้ว การเรียกครั้งเดียวจะให้ข้อความที่แก้ไขแล้ว เรายังพิมพ์ทั้งเวอร์ชันดั้งเดิมและเวอร์ชันที่ทำความสะอาดเพื่อให้คุณเห็นการปรับปรุง. + +```python +# Run the spell‑check post‑processor on the OCR result +corrected_text = ocr_ai.run_postprocessor(raw_text) + +print("Original :", raw_text) +print("Corrected:", corrected_text) +``` + +ผลลัพธ์ทั่วไปสำหรับใบแจ้งหนี้อาจมีลักษณะดังนี้: + +``` +Original : Invo1ce #12345 +Date: 2023/07/15 +Total: $1,250.00 +... +Corrected: Invoice #12345 +Date: 2023/07/15 +Total: $1,250.00 +... +``` + +สังเกตว่า “Invo1ce” ถูกเปลี่ยนเป็นคำที่ถูกต้อง “Invoice” นั่นคือพลังของการตรวจสอบการสะกด AI ในตัว. + +## ขั้นตอนที่ 6: ปล่อยทรัพยากร GPU – ปล่อยทรัพยากร GPU อย่างปลอดภัย + +หากคุณรันสคริปต์นี้ในบริการที่ทำงานต่อเนื่อง (เช่นเว็บ API ที่ประมวลผลหลายสิบใบแจ้งหนี้ต่อวินาที) คุณต้องปล่อยคอนเท็กซ์ GPU หลังจากแต่ละชุด มิฉะนั้นจะเกิดการรั่วหน่วยความจำและในที่สุดจะเจอข้อผิดพลาด “CUDA out of memory”. + +```python +# Release GPU resources – crucial to avoid memory leaks +ocr_ai.free_resources() +``` + +**เคล็ดลับมืออาชีพ:** เรียก `free_resources()` ภายในบล็อก `finally` หรือ context manager เพื่อให้มันทำงานเสมอ แม้เกิดข้อยกเว้น. 
+ +## ตัวอย่างทำงานเต็มรูปแบบ + +การรวมส่วนต่าง ๆ เข้าด้วยกันให้คุณได้สคริปต์ที่เป็นอิสระที่สามารถใส่ลงในโปรเจกต์ใดก็ได้. + +```python +# extract_text_from_image.py +import aocr +import aspose.ai as ai + +def main(): + # Step 1: Initialise OCR engine + ocr_engine = aocr.OcrEngine() + ocr_engine.language = aocr.Language.English + ocr_engine.recognize_mode = aocr.RecognitionMode.Plain + + # Step 2: Load image for OCR + input_image = aocr.Image.load("YOUR_DIRECTORY/sample_invoice.png") + raw_text = ocr_engine.recognize(input_image) + + # Step 3: Configure Aspose AI model + model_cfg = ai.AsposeAIModelConfig() + model_cfg.allow_auto_download = "true" + model_cfg.hugging_face_repo_id = "bartowski/Qwen2.5-3B-Instruct-GGUF" + model_cfg.hugging_face_quantization = "q4_k_m" + model_cfg.gpu_layers = 20 + + # Step 4: Initialise AI and attach spell‑check + ocr_ai = ai.AsposeAI(model_cfg) + ocr_ai.set_post_processor(ocr_ai.postprocessor_spell_check, {}) + + # Step 5: Run spell‑check + corrected_text = ocr_ai.run_postprocessor(raw_text) + + print("Original :", raw_text) + print("Corrected:", corrected_text) + + # Step 6: Release GPU resources + ocr_ai.free_resources() + +if __name__ == "__main__": + main() +``` + +บันทึกไฟล์, ปรับเส้นทางไปยังภาพของคุณ, และรัน `python extract_text_from_image.py` คุณควรเห็นข้อความใบแจ้งหนี้ที่ทำความสะอาดแล้วแสดงบนคอนโซล. + +## คำถามที่พบบ่อย (FAQ) + +**Q: ทำงานบนเครื่องที่มีเฉพาะ CPU หรือไม่?** +A: แน่นอน หากไม่พบ GPU Aspose AI จะย้อนกลับไปใช้ CPU แม้ว่าจะช้ากว่า คุณสามารถบังคับใช้ CPU ได้โดยตั้งค่า `model_cfg.gpu_layers = 0`. + +**Q: ถ้าใบแจ้งหนี้ของฉันเป็นภาษาอื่นที่ไม่ใช่ English จะทำอย่างไร?** +A: เปลี่ยน `ocr_engine.language` เป็นค่า enum ที่เหมาะสม (เช่น `aocr.Language.Spanish`). โมเดลตรวจสอบการสะกดเป็นหลายภาษา แต่คุณอาจได้ผลลัพธ์ที่ดีกว่าด้วยโมเดลเฉพาะภาษา. 
+ +**Q: ฉันสามารถประมวลผลหลายภาพในลูปได้หรือไม่?** +A: ได้ เพียงย้ายขั้นตอนการโหลด, การจดจำ, และ post‑processing เข้าไปในลูป `for` จำไว้ว่าต้องเรียก `ocr_ai.free_resources()` หลังลูปหรือหลังแต่ละชุดหากคุณใช้ AI อินสแตนซ์เดียวกันซ้ำ. + +**Q: ขนาดการดาวน์โหลดโมเดลเท่าไหร่?** +A: ประมาณ 1.5 GB สำหรับเวอร์ชันคอมพิวท์ `q4_k_m` จะถูกแคชหลังจากรันครั้งแรก ดังนั้นการรันต่อมาจะเร็วทันใจ. + +## สรุป + +ในบทแนะนำนี้เราได้สาธิตวิธี **สกัดข้อความจากรูปภาพ** ด้วย Aspose OCR, กำหนดค่าโมเดล AI ขนาดเล็ก, ใช้ post‑processor ตรวจสอบการสะกด, และปล่อย **ทรัพยากร GPU** อย่างปลอดภัย กระบวนการครอบคลุมตั้งแต่การโหลดภาพจนถึงการทำความสะอาดหลังการใช้งาน ให้คุณมี pipeline ที่เชื่อถือได้สำหรับสถานการณ์ **จดจำข้อความจากใบแจ้งหนี้** + +ขั้นตอนต่อไป? ลองเปลี่ยน post‑processor ตรวจสอบการสะกดเป็นโมเดลการสกัดเอนทิตีแบบกำหนดเอง + +{{< /blocks/products/pf/tutorial-page-section >}} +{{< /blocks/products/pf/main-container >}} +{{< /blocks/products/pf/main-wrap-class >}} +{{< blocks/products/products-backtop-button >}} \ No newline at end of file diff --git a/ocr/thai/python/general/how-to-batch-ocr-with-aspose-ocr-full-python-guide/_index.md b/ocr/thai/python/general/how-to-batch-ocr-with-aspose-ocr-full-python-guide/_index.md new file mode 100644 index 000000000..d8112e078 --- /dev/null +++ b/ocr/thai/python/general/how-to-batch-ocr-with-aspose-ocr-full-python-guide/_index.md @@ -0,0 +1,215 @@ +--- +category: general +date: 2026-05-03 +description: วิธีทำ OCR รูปภาพเป็นชุดโดยใช้ Aspose OCR และการตรวจสอบการสะกดด้วย AI + เรียนรู้วิธีดึงข้อความจากรูปภาพ ใช้การตรวจสอบการสะกด แหล่งทรัพยากร AI ฟรี และแก้ไขข้อผิดพลาดของ + OCR +draft: false +keywords: +- how to batch ocr +- extract text from images +- free ai resources +- apply spell check +- correct ocr errors +language: th +og_description: วิธีทำ OCR รูปภาพเป็นชุดโดยใช้ Aspose OCR และการตรวจสอบการสะกดด้วย + AI. ทำตามคู่มือขั้นตอนต่อขั้นตอนเพื่อดึงข้อความจากรูปภาพ, ใช้การตรวจสอบการสะกด, + ใช้ทรัพยากร AI ฟรีและแก้ไขข้อผิดพลาดของ OCR. 
+og_title: วิธีทำ OCR แบบชุดด้วย Aspose OCR – บทเรียน Python ฉบับสมบูรณ์ +tags: +- OCR +- Python +- AI +- Aspose +title: วิธีทำ OCR แบบกลุ่มด้วย Aspose OCR – คู่มือ Python ฉบับเต็ม +url: /th/python/general/how-to-batch-ocr-with-aspose-ocr-full-python-guide/ +--- + +{{< blocks/products/pf/main-wrap-class >}} +{{< blocks/products/pf/main-container >}} +{{< blocks/products/pf/tutorial-page-section >}} + +# วิธีทำ Batch OCR ด้วย Aspose OCR – คู่มือ Python ฉบับเต็ม + +เคยสงสัย **วิธีทำ batch OCR** โฟลเดอร์เต็มของไฟล์ PDF หรือรูปสแกนโดยไม่ต้องเขียนสคริปต์แยกสำหรับแต่ละไฟล์หรือไม่? คุณไม่ได้เป็นคนเดียว ในหลาย ๆ pipeline ของโลกจริง คุณจะต้อง **ดึงข้อความจากรูปภาพ**, ทำความสะอาดข้อผิดพลาดการสะกด, และสุดท้ายปล่อยทรัพยากร AI ที่คุณได้จัดสรรไว้ บทแนะนำนี้จะแสดงให้คุณเห็นอย่างชัดเจนว่าจะทำอย่างไรด้วย Aspose OCR, post‑processor AI ที่เบา, และไม่กี่บรรทัดของ Python. + +เราจะอธิบายขั้นตอนการเริ่มต้น OCR engine, เชื่อมต่อ AI spell‑checker, วนลูปผ่านไดเรกทอรีของรูปภาพ, และทำความสะอาดโมเดลหลังจากนั้น. เมื่อเสร็จคุณจะได้สคริปต์พร้อมรันที่ **แก้ไขข้อผิดพลาด OCR** อัตโนมัติและปล่อย **ทรัพยากร AI ฟรี** เพื่อให้ GPU ของคุณทำงานได้อย่างราบรื่น. + +## สิ่งที่คุณต้องการ + +- Python 3.9+ (โค้ดใช้ type‑hints แต่ทำงานได้กับเวอร์ชัน 3.x ก่อนหน้า) +- `asposeocr` package (`pip install asposeocr`) – ให้บริการ OCR engine. +- เข้าถึงโมเดล Hugging Face `bartowski/Qwen2.5-3B-Instruct-GGUF` (ดาวน์โหลดโดยอัตโนมัติ). +- GPU ที่มี VRAM อย่างน้อยหลาย GB (สคริปต์ตั้งค่า `gpu_layers = 30`, คุณสามารถลดได้หากต้องการ). + +ไม่มีบริการภายนอก, ไม่มี API ที่ต้องชำระเงิน – ทุกอย่างทำงานบนเครื่องท้องถิ่น. + +--- + +## ขั้นตอนที่ 1: ตั้งค่า OCR Engine – **วิธีทำ Batch OCR** อย่างมีประสิทธิภาพ + +ก่อนที่เราจะประมวลผลภาพจำนวนพัน เราต้องการ OCR engine ที่มั่นคง Aspose OCR ให้เราสามารถเลือกภาษาและโหมดการจดจำในคำสั่งเดียว. 
+ +```python +# Step 1: Initialize the OCR engine for English plain‑text output +def init_ocr() -> aocr.OcrEngine: + ocr_engine = aocr.OcrEngine() + ocr_engine.language = aocr.Language.English # English language pack + ocr_engine.recognize_mode = aocr.RecognitionMode.Plain # Returns raw string, no layout + return ocr_engine +``` + +**ทำไมเรื่องนี้สำคัญ:** การตั้งค่า `recognize_mode` เป็น `Plain` ทำให้ผลลัพธ์มีน้ำหนักเบา ซึ่งเหมาะเมื่อคุณวางแผนจะทำการตรวจสอบการสะกดต่อไป หากคุณต้องการข้อมูลการจัดหน้า คุณจะสลับเป็น `Layout` แต่จะเพิ่มภาระที่คุณอาจไม่ต้องการในงาน batch. + +> **เคล็ดลับ:** หากคุณกำลังจัดการกับการสแกนหลายภาษา คุณสามารถส่งรายการเช่น `ocr_engine.language = [aocr.Language.English, aocr.Language.Spanish]`. + +--- + +## ขั้นตอนที่ 2: เริ่มต้น AI Post‑Processor – **ใช้การตรวจสอบการสะกด** กับผลลัพธ์ OCR + +Aspose AI มาพร้อมกับ post‑processor ในตัวที่สามารถรันโมเดลใดก็ได้ที่คุณต้องการ ที่นี่เราดึงโมเดล Qwen 2.5 ที่ถูกควอนไทซ์จาก Hugging Face และเชื่อมต่อฟังก์ชันตรวจสอบการสะกด. + +```python +# Step 2: Configure and start the AI post‑processor +def init_ai() -> aocr.ai.AsposeAI: + model_cfg = AsposeAIModelConfig() + model_cfg.allow_auto_download = "true" + model_cfg.hugging_face_repo_id = "bartowski/Qwen2.5-3B-Instruct-GGUF" + model_cfg.hugging_face_quantization = "q4_k_m" + model_cfg.gpu_layers = 30 # Adjust based on your GPU memory + ai_processor = AsposeAI() + ai_processor.initialize(model_cfg) + + # Attach the built‑in spell‑check post‑processor + ai_processor.set_post_processor(ai_processor.postprocessor_spell_check, {}) + return ai_processor +``` + +**ทำไมเรื่องนี้สำคัญ:** โมเดลถูกควอนไทซ์ (`q4_k_m`) ซึ่งลดการใช้หน่วยความจำอย่างมากในขณะที่ยังให้ความเข้าใจภาษาในระดับที่ดี โดยการเรียก `set_post_processor` เราบอก Aspose AI ให้รันขั้นตอน **apply spell check** โดยอัตโนมัติบนสตริงใด ๆ ที่เราป้อนเข้าไป. + +> **ระวัง:** หาก GPU ของคุณไม่สามารถจัดการ 30 ชั้นได้ ให้ลดจำนวนลงเป็น 15 หรือแม้แต่ 5 – สคริปต์ยังทำงานได้ แต่จะช้าลงเล็กน้อย. 
+ +--- + +## ขั้นตอนที่ 3: รัน OCR และ **แก้ไขข้อผิดพลาด OCR** บนรูปภาพเดียว + +ตอนนี้ OCR engine และ AI spell‑checker พร้อมแล้ว เราจะรวมเข้าด้วยกัน ฟังก์ชันนี้โหลดรูปภาพ, ดึงข้อความดิบ, แล้วรัน AI post‑processor เพื่อทำความสะอาด. + +```python +# Step 3: OCR an image and run the spell‑check post‑processor +def ocr_and_correct(image_path: str, + ocr_engine: aocr.OcrEngine, + ai_processor: aocr.ai.AsposeAI) -> str: + image = aocr.Image.load(image_path) # Load any supported format + raw_text = ocr_engine.recognize(image) # Plain string from OCR + corrected_text = ai_processor.run_postprocessor(raw_text) + return corrected_text +``` + +**ทำไมเรื่องนี้สำคัญ:** การส่งสตริง OCR ดิบตรงเข้าสู่โมเดล AI ทำให้เราได้ขั้นตอน **correct OCR errors** โดยไม่ต้องเขียน regex หรือพจนานุกรมกำหนดเอง โมเดลเข้าใจบริบท จึงสามารถแก้ “recieve” → “receive” และข้อผิดพลาดที่ละเอียดอ่อนยิ่งขึ้น. + +--- + +## ขั้นตอนที่ 4: **ดึงข้อความจากรูปภาพ** เป็นจำนวนมาก – ลูป Batch ที่แท้จริง + +นี่คือจุดที่ความมหัศจรรย์ของ **วิธีทำ batch OCR** ปรากฏ เราจะวนลูปผ่านไดเรกทอรี, ข้ามไฟล์ที่ไม่รองรับ, และเขียนผลลัพธ์ที่แก้ไขแล้วแต่ละไฟล์เป็นไฟล์ `.txt`. 
+ +```python +# Step 4: Process an entire folder of images +if __name__ == "__main__": + # Initialize once – reuse for every file + ocr_engine = init_ocr() + ai_processor = init_ai() + + input_dir = "YOUR_DIRECTORY/input_images" + output_dir = "YOUR_DIRECTORY/output_text" + os.makedirs(output_dir, exist_ok=True) + + for file_name in os.listdir(input_dir): + # Only handle common image extensions + if not file_name.lower().endswith(('.png', '.jpg', '.jpeg', '.tif', '.tiff')): + continue + + image_path = os.path.join(input_dir, file_name) + corrected = ocr_and_correct(image_path, ocr_engine, ai_processor) + + txt_path = os.path.join(output_dir, + os.path.splitext(file_name)[0] + ".txt") + with open(txt_path, "w", encoding="utf-8") as txt_file: + txt_file.write(corrected) + + print(f"Processed {file_name}") + + # Step 5: Release **free AI resources** after the batch finishes + ai_processor.free_resources() +``` + +### ผลลัพธ์ที่คาดหวัง + +สำหรับรูปภาพที่มีประโยค *“The quick brown fox jumps over the lazzy dog.”* คุณจะเห็นไฟล์ข้อความที่มี: + +``` +The quick brown fox jumps over the lazy dog. +``` + +สังเกตว่า “z” คู่ถูกแก้ไขโดยอัตโนมัติ – นั่นคือ AI spell‑check ทำงาน. + +**ทำไมเรื่องนี้สำคัญ:** ด้วยการสร้างวัตถุ OCR และ AI **ครั้งเดียว** แล้วใช้ซ้ำ เราจะหลีกเลี่ยงภาระการโหลดโมเดลสำหรับแต่ละไฟล์ นี่เป็นวิธีที่มีประสิทธิภาพที่สุดในการ **ทำ batch OCR** ในระดับใหญ่. + +--- + +## ขั้นตอนที่ 5: ทำความสะอาด – **ปล่อยทรัพยากร AI** อย่างถูกต้อง + +เมื่อเสร็จสิ้น การเรียก `free_resources()` จะปล่อยหน่วยความจำ GPU, บริบท CUDA, และไฟล์ชั่วคราวใด ๆ ที่โมเดลสร้างขึ้น. + +```python +# Step 5: Explicitly free GPU and model memory +ai_processor.free_resources() +``` + +การข้ามขั้นตอนนี้อาจทำให้มีการจัดสรร GPU ค้างอยู่ ซึ่งอาจทำให้กระบวนการ Python ถัดไปล่มหรือใช้ VRAM มากเกินไป คิดว่าเป็นส่วน “ปิดไฟ” ของงาน batch. 
+ +--- + +## ข้อผิดพลาดทั่วไป & เคล็ดลับเพิ่มเติม + +| Issue | What to Look For | Fix | +|-------|------------------|-----| +| **ข้อผิดพลาด Out‑of‑memory** | GPU หมดหลังจากประมวลผลหลายสิบภาพ | ลด `gpu_layers` หรือสลับเป็น CPU (`model_cfg.gpu_layers = 0`). | +| **Missing language pack** | OCR คืนค่าเป็นสตริงว่าง | ตรวจสอบว่าเวอร์ชัน `asposeocr` มีข้อมูลภาษาอังกฤษ; ติดตั้งใหม่หากจำเป็น. | +| **ไฟล์ที่ไม่ใช่รูปภาพ** | สคริปต์ล่มเมื่อเจอไฟล์ `.pdf` ที่หลงเหลือ | เงื่อนไข `if not file_name.lower().endswith(...)` จะข้ามไฟล์เหล่านั้นอยู่แล้ว. | +| **Spell‑check ไม่ทำงาน** | ผลลัพธ์เหมือนกับ OCR ดิบ | ตรวจสอบว่าได้เรียก `ai_processor.set_post_processor` ก่อนลูป. | +| **ความเร็ว batch ช้า** | ใช้เวลา >5 วินาทีต่อภาพ | เปิดใช้งาน `model_cfg.allow_auto_download = "false"` หลังการรันครั้งแรก เพื่อไม่ให้โมเดลดาวน์โหลดซ้ำทุกครั้ง. | + +**เคล็ดลับ:** หากคุณต้องการ **ดึงข้อความจากรูปภาพ** ในภาษาที่ไม่ใช่ภาษาอังกฤษ เพียงเปลี่ยน `ocr_engine.language` เป็น enum ที่เหมาะสม (เช่น `aocr.Language.French`). AI post‑processor เดียวกันจะยังคงทำการตรวจสอบการสะกด, แต่คุณอาจต้องการโมเดลเฉพาะภาษาสำหรับผลลัพธ์ที่ดีที่สุด. + +--- + +## สรุป & ขั้นตอนต่อไป + +เราได้ครอบคลุม pipeline ทั้งหมดสำหรับ **วิธีทำ batch OCR**: + +1. **Initialize** OCR engine แบบ plain‑text สำหรับภาษาอังกฤษ. +2. **Configure** โมเดล AI ตรวจสอบการสะกดและผูกเป็น post‑processor. +3. **Run** OCR บนแต่ละภาพและให้ AI **correct OCR errors** โดยอัตโนมัติ. +4. **Loop** ผ่านไดเรกทอรีเพื่อ **ดึงข้อความจากรูปภาพ** จำนวนมาก. +5. **Free AI resources** เมื่องานเสร็จสิ้น. + +จากนี้คุณสามารถ: + +- ส่งข้อความที่แก้ไขแล้วเข้าสู่ pipeline NLP ต่อไป (การวิเคราะห์ความรู้สึก, การสกัดเอนทิตี้, ฯลฯ). +- เปลี่ยน post‑processor ตรวจสอบการสะกดเป็นสรุปแบบกำหนดเองโดยเรียก `ai_processor.set_post_processor(your_custom_func, {})`. +- ทำให้ลูปโฟลเดอร์ทำงานแบบขนานด้วย `concurrent.futures.ThreadPoolExecutor` หาก GPU ของคุณรองรับหลายสตรีม. 
+ +--- + +## ความคิดสุดท้าย + +การทำ Batch OCR ไม่จำเป็นต้องเป็นภาระหนัก ด้วยการใช้ Aspose OCR ร่วมกับโมเดล AI ที่เบา คุณจะได้ **โซลูชันครบวงจร** ที่ **ดึงข้อความจากรูปภาพ**, **ใช้การตรวจสอบการสะกด**, **แก้ไขข้อผิดพลาด OCR**, และ **ปล่อยทรัพยากร AI** อย่างสะอาด ให้สคริปต์ทำงานบนโฟลเดอร์ทดสอบ, ปรับจำนวนชั้น GPU ให้ตรงกับฮาร์ดแวร์ของคุณ, แล้วคุณจะมี pipeline พร้อมใช้งานในไม่กี่นาที. + +มีคำถามเกี่ยวกับการปรับโมเดล, การจัดการ PDF, หรือการรวมเข้ากับบริการเว็บ? แสดงความคิดเห็นด้านล่างหรือทักมาผมบน GitHub. โค้ดดิ้งอย่างสนุกสนาน, และขอให้ OCR ของคุณแม่นยำเสมอ! + +{{< /blocks/products/pf/tutorial-page-section >}} +{{< /blocks/products/pf/main-container >}} +{{< /blocks/products/pf/main-wrap-class >}} +{{< blocks/products/products-backtop-button >}} \ No newline at end of file diff --git a/ocr/thai/python/general/python-ocr-tutorial-batch-ocr-processing-made-easy/_index.md b/ocr/thai/python/general/python-ocr-tutorial-batch-ocr-processing-made-easy/_index.md new file mode 100644 index 000000000..0cf62055e --- /dev/null +++ b/ocr/thai/python/general/python-ocr-tutorial-batch-ocr-processing-made-easy/_index.md @@ -0,0 +1,299 @@ +--- +category: general +date: 2026-05-03 +description: บทเรียน OCR ด้วย Python ที่แสดงวิธีโหลดไฟล์ภาพ PNG, แยกข้อความจากภาพ + และแหล่งทรัพยากร AI ฟรีสำหรับการประมวลผล OCR แบบกลุ่ม +draft: false +keywords: +- python ocr tutorial +- batch ocr processing +- free ai resources +- load png image +- recognize text from image +language: th +og_description: บทเรียน OCR ด้วย Python จะพาคุณผ่านขั้นตอนการโหลดภาพ PNG, การจดจำข้อความจากภาพ + และการจัดการทรัพยากร AI ฟรีสำหรับการประมวลผล OCR แบบกลุ่ม +og_title: บทเรียน OCR ด้วย Python – การทำ OCR แบบแบตช์อย่างรวดเร็วด้วยแหล่งทรัพยากร + AI ฟรี +tags: +- OCR +- Python +- AI +title: บทเรียน OCR ด้วย Python – การประมวลผล OCR แบบชุดทำได้ง่าย +url: /th/python/general/python-ocr-tutorial-batch-ocr-processing-made-easy/ +--- + +{{< blocks/products/pf/main-wrap-class >}} +{{< blocks/products/pf/main-container >}} +{{< 
blocks/products/pf/tutorial-page-section >}} + +# การสอน Python OCR – การประมวลผล OCR แบบกลุ่มอย่างง่าย + +เคยต้องการ **python ocr tutorial** ที่จริงจังและทำให้คุณรัน OCR บนไฟล์ PNG หลายสิบไฟล์โดยไม่ต้องบิดหัวไหม? คุณไม่ได้เป็นคนเดียว ในหลายโครงการจริง ๆ คุณต้อง **load png image** ไฟล์เหล่านั้น, ป้อนให้กับเอนจิน, แล้วทำความสะอาดทรัพยากร AI เมื่อเสร็จสิ้น + +ในคู่มือนี้เราจะเดินผ่านตัวอย่างที่พร้อมรันครบถ้วน ซึ่งแสดงให้เห็นอย่างชัดเจนว่า **recognize text from image** อย่างไร, ประมวลผลเป็นกลุ่ม, และปล่อยหน่วยความจำ AI ที่ใช้ไว้ หลังจากเสร็จสิ้น คุณจะได้สคริปต์ที่สามารถนำไปใช้ในโปรเจกต์ใดก็ได้—ไม่มีส่วนเกิน, มีแค่สิ่งที่จำเป็น + +## สิ่งที่คุณต้องมี + +- Python 3.10 หรือใหม่กว่า (ไวยากรณ์ที่ใช้ในที่นี้พึ่งพา f‑strings และ type hints) +- ไลบรารี OCR ที่มีเมธอด `engine.recognize` – สำหรับการสาธิตเราจะสมมติว่าใช้แพคเกจ `aocr` แต่ว่าคุณสามารถเปลี่ยนเป็น Tesseract, EasyOCR ฯลฯ ได้ +- โมดูลช่วยเหลือ `ai` ที่แสดงในโค้ดสแนป (มันจัดการการเริ่มต้นโมเดลและการทำความสะอาดทรัพยากร) +- โฟลเดอร์ที่เต็มไปด้วยไฟล์ PNG ที่คุณต้องการประมวลผล + +หากคุณยังไม่มี `aocr` หรือ `ai` ติดตั้ง, คุณสามารถจำลองด้วยสตับได้ – ดูส่วน “Optional Stubs” ใกล้ท้ายบทความ + +## ขั้นตอนที่ 1: เริ่มต้น AI Engine (Free AI Resources) + +ก่อนที่คุณจะป้อนรูปภาพใด ๆ เข้าไปใน pipeline ของ OCR, โมเดลพื้นฐานต้องพร้อม การเริ่มต้นเพียงครั้งเดียวช่วยประหยัดหน่วยความจำและเร่งความเร็วของงานแบบกลุ่ม + +```python +# step_1_initialize.py +import ai # hypothetical helper that wraps the AI model +import aocr # OCR library + +def init_engine(config_path: str = "config.yaml"): + """ + Initialize the AI engine if it hasn't been set up yet. + This uses free AI resources – the engine will be released later. 
+ """ + if not ai.is_initialized(): + ai.initialize(config_path) # auto‑initialize with the provided configuration + else: + print("Engine already initialized.") +``` + +**ทำไมจึงสำคัญ:** +การเรียก `ai.initialize` ซ้ำ ๆ สำหรับแต่ละรูปภาพจะทำให้หน่วยความจำ GPU ถูกจัดสรรซ้ำ ๆ จนทำให้สคริปต์ล่มได้ การตรวจสอบ `ai.is_initialized()` ทำให้มั่นใจว่ามีการจัดสรรเพียงครั้งเดียว – นี่คือหลักการ “free AI resources” + +## ขั้นตอนที่ 2: โหลดไฟล์ PNG สำหรับการประมวลผล OCR แบบกลุ่ม + +ต่อไปเราจะรวบรวมไฟล์ PNG ทั้งหมดที่ต้องการรัน OCR การใช้ `pathlib` ทำให้โค้ดทำงานได้บนทุกระบบปฏิบัติการ + +```python +# step_2_load_images.py +from pathlib import Path +from typing import List + +def collect_png_paths(directory: str) -> List[Path]: + """ + Scan `directory` and return a list of Path objects pointing to PNG files. + """ + base_path = Path(directory) + if not base_path.is_dir(): + raise NotADirectoryError(f"'{directory}' is not a valid folder.") + + png_files = sorted(base_path.glob("*.png")) + if not png_files: + raise FileNotFoundError("No PNG images found in the specified directory.") + + print(f"Found {len(png_files)} PNG image(s) to process.") + return png_files +``` + +**กรณีขอบ:** +หากโฟลเดอร์มีไฟล์ที่ไม่ใช่ PNG (เช่น JPEG) จะถูกละเว้น, ป้องกันไม่ให้ `engine.recognize` เกิดข้อผิดพลาดจากรูปแบบที่ไม่รองรับ + +## ขั้นตอนที่ 3: รัน OCR บนแต่ละภาพและทำ Post‑Processing + +เมื่อเอนจินพร้อมและรายการไฟล์ถูกเตรียมแล้ว, เราสามารถวนลูปผ่านรูปภาพ, ดึงข้อความดิบ, แล้วส่งให้ post‑processor ที่ทำความสะอาด artefacts ของ OCR (เช่น การขึ้นบรรทัดใหม่ที่ไม่ต้องการ) + +```python +# step_3_ocr_batch.py +import aocr +import ai +from pathlib import Path +from typing import List + +def ocr_batch(image_paths: List[Path]) -> List[str]: + """ + Perform OCR on each PNG image and return a list of cleaned strings. 
+ """ + results = [] + for image_path in image_paths: + # Load the image – aocr.Image.load abstracts away Pillow/OpenCV details + img = aocr.Image.load(str(image_path)) + + # Recognize raw text + raw_text = aocr.engine.recognize(img) + + # Refine the raw OCR output using the AI post‑processor + cleaned_text = ai.run_postprocessor(raw_text) + results.append(cleaned_text) + + print(f"Processed {image_path.name}: {len(cleaned_text)} characters extracted.") + + return results +``` + +**ทำไมต้องแยกการโหลดออกจากการจดจำ:** +`aocr.Image.load` อาจทำการถอดรหัสแบบ lazy, ซึ่งเร็วกว่าเมื่อจัดการกับชุดข้อมูลขนาดใหญ่ การแยกขั้นตอนการโหลดทำให้ง่ายต่อการสลับไลบรารีรูปภาพอื่น ๆ หากคุณต้องการรองรับ JPEG หรือ TIFF ในอนาคต + +## ขั้นตอนที่ 4: ทำความสะอาด – ปล่อยทรัพยากร AI หลังจากทำงานเป็นกลุ่ม + +เมื่อการประมวลผลกลุ่มเสร็จสิ้น, เราต้องปล่อยโมเดลเพื่อหลีกเลี่ยงการรั่วของหน่วยความจำ, โดยเฉพาะบนเครื่องที่เปิดใช้ GPU + +```python +# step_4_cleanup.py +import ai + +def release_resources(): + """ + Free any allocated AI resources. Safe to call multiple times. + """ + if ai.is_initialized(): + ai.free_resources() + print("AI resources have been released.") + else: + print("No AI resources were allocated.") +``` + +## รวมทุกอย่างเข้าด้วยกัน – สคริปต์เต็มรูปแบบ + +ด้านล่างเป็นไฟล์เดียวที่เชื่อมขั้นตอนสี่ขั้นตอนเข้าด้วยกันเป็น workflow ที่ต่อเนื่อง บันทึกเป็น `batch_ocr.py` แล้วรันจาก command line + +```python +# batch_ocr.py +""" +Python OCR tutorial – end‑to‑end batch OCR processing. +Loads PNG images, runs OCR, post‑processes results, and frees AI resources. 
+""" + +import sys +from pathlib import Path +import ai +import aocr + +# ---------------------------------------------------------------------- +# Helper functions (copied from the steps above) +# ---------------------------------------------------------------------- +def init_engine(cfg: str = "config.yaml"): + if not ai.is_initialized(): + ai.initialize(cfg) + else: + print("Engine already initialized.") + +def collect_png_paths(directory: str): + base_path = Path(directory) + if not base_path.is_dir(): + raise NotADirectoryError(f"'{directory}' is not a valid folder.") + png_files = sorted(base_path.glob("*.png")) + if not png_files: + raise FileNotFoundError("No PNG images found in the specified directory.") + print(f"Found {len(png_files)} PNG image(s) to process.") + return png_files + +def ocr_batch(image_paths): + results = [] + for image_path in image_paths: + img = aocr.Image.load(str(image_path)) + raw_text = engine.recognize(img) + cleaned_text = ai.run_postprocessor(raw_text) + results.append(cleaned_text) + print(f"Processed {image_path.name}: {len(cleaned_text)} characters.") + return results + +def release_resources(): + if ai.is_initialized(): + ai.free_resources() + print("AI resources released.") + else: + print("No resources to release.") + +# ---------------------------------------------------------------------- +# Main execution block +# ---------------------------------------------------------------------- +if __name__ == "__main__": + if len(sys.argv) != 2: + print("Usage: python batch_ocr.py ") + sys.exit(1) + + image_dir = sys.argv[1] + + try: + init_engine() + png_paths = collect_png_paths(image_dir) + texts = ocr_batch(png_paths) + + # Optional: write results to a single text file + output_file = Path("ocr_results.txt") + with output_file.open("w", encoding="utf-8") as f: + for path, txt in zip(png_paths, texts): + f.write(f"--- {path.name} ---\n") + f.write(txt + "\n\n") + print(f"All results saved to {output_file.resolve()}") + 
finally: + release_resources() +``` + +### ผลลัพธ์ที่คาดหวัง + +การรันสคริปต์กับโฟลเดอร์ที่มี PNG สามไฟล์อาจพิมพ์: + +``` +Engine already initialized. +Found 3 PNG image(s) to process. +Processed invoice1.png: 452 characters. +Processed receipt2.png: 317 characters. +Processed flyer3.png: 689 characters. +All results saved to /home/user/ocr_results.txt +AI resources released. +``` + +ไฟล์ `ocr_results.txt` จะมีตัวแบ่งที่ชัดเจนสำหรับแต่ละภาพตามด้วยข้อความ OCR ที่ทำความสะอาดแล้ว + +## สตับแบบเลือกใช้สำหรับ aocr & ai (หากคุณไม่มีแพคเกจจริง) + +หากคุณต้องการทดสอบ flow โดยไม่ต้องดึงไลบรารี OCR ขนาดใหญ่, คุณสามารถสร้างโมดูล mock ขั้นพื้นฐานได้: + +```python +# aocr/__init__.py +class Image: + @staticmethod + def load(path): + return f"ImageObject({path})" + +def dummy_recognize(image): + return "Raw OCR output for " + str(image) + +engine = type("Engine", (), {"recognize": staticmethod(dummy_recognize)})() +``` + +```python +# ai/__init__.py +_state = {"initialized": False} + +def is_initialized(): + return _state["initialized"] + +def initialize(cfg): + print(f"Initializing AI engine with {cfg}") + _state["initialized"] = True + +def run_postprocessor(text): + # Very naive cleanup: strip extra spaces + return " ".join(text.split()) + +def free_resources(): + print("Freeing AI resources") + _state["initialized"] = False +``` + +วางโฟลเดอร์เหล่านี้ไว้ข้าง `batch_ocr.py` แล้วสคริปต์จะทำงาน, พิมพ์ผลลัพธ์ mock + +## เคล็ดลับระดับมืออาชีพ & จุดบกพร่องที่พบบ่อย + +- **การกระเด็นของหน่วยความจำ:** หากคุณประมวลผล PNG ความละเอียดสูงหลายพันไฟล์, ควรปรับขนาดภาพก่อน OCR. `aocr.Image.load` มักรับอาร์กิวเมนต์ `max_size` +- **การจัดการ Unicode:** เปิดไฟล์ผลลัพธ์ด้วย `encoding="utf-8"` เสมอ; เอนจิน OCR สามารถส่งอักขระที่ไม่ใช่ ASCII ได้ +- **การทำงานแบบขนาน:** สำหรับ OCR ที่ใช้ CPU คุณสามารถห่อ `ocr_batch` ด้วย `concurrent.futures.ThreadPoolExecutor`. 
จำไว้ว่าให้ใช้อินสแตนซ์ `ai` เพียงหนึ่งเดียว – การสร้างหลายเธรดที่แต่ละอันเรียก `ai.initialize` จะทำลายเป้าหมาย “free AI resources” +- **ความทนทานต่อข้อผิดพลาด:** ห่อการวนลูปต่อภาพด้วย `try/except` เพื่อให้ PNG ที่เสียหายหนึ่งไฟล์ไม่ทำให้การประมวลผลทั้งหมดหยุด + +## สรุป + +ตอนนี้คุณมี **python ocr tutorial** ที่แสดงวิธี **load png image** ไฟล์, ทำ **batch OCR processing**, และจัดการ **free AI resources** อย่างรับผิดชอบ ตัวอย่างที่ครบถ้วนและรันได้แสดงให้เห็นอย่างชัดเจนว่า **recognize text from image** อย่างไรและทำความสะอาดหลังจากนั้น, เพื่อให้คุณคัดลอก‑วางเข้าโปรเจกต์ของคุณได้โดยไม่ต้องค้นหาโค้ดที่ขาดหาย + +พร้อมก้าวต่อไปหรือยัง? ลองสลับโมดูล `aocr` และ `ai` stub ด้วยไลบรารีจริงเช่น `pytesseract` และ `torchvision`. คุณยังสามารถขยายสคริปต์ให้ส่งออกเป็น JSON, ผลักดันผลลัพธ์ไปยังฐานข้อมูล, หรือผสานกับ bucket ของคลาวด์ได้ ไม่จำกัดอะไร—ขอให้สนุกกับการเขียนโค้ด! + +{{< /blocks/products/pf/tutorial-page-section >}} +{{< /blocks/products/pf/main-container >}} +{{< /blocks/products/pf/main-wrap-class >}} +{{< blocks/products/products-backtop-button >}} \ No newline at end of file diff --git a/ocr/thai/python/general/run-ocr-on-image-complete-guide-to-structured-text-extractio/_index.md b/ocr/thai/python/general/run-ocr-on-image-complete-guide-to-structured-text-extractio/_index.md new file mode 100644 index 000000000..6eea313f8 --- /dev/null +++ b/ocr/thai/python/general/run-ocr-on-image-complete-guide-to-structured-text-extractio/_index.md @@ -0,0 +1,254 @@ +--- +category: general +date: 2026-05-03 +description: เรียนรู้วิธีทำ OCR บนภาพและดึงข้อความพร้อมพิกัดโดยใช้การจดจำ OCR แบบโครงสร้าง + พร้อมโค้ด Python ทีละขั้นตอน. 
+draft: false +keywords: +- run OCR on image +- extract text with coordinates +- structured OCR recognition +- OCR post‑processing +- bounding box extraction +- image text detection +language: th +og_description: ทำ OCR บนภาพและรับข้อความพร้อมพิกัดโดยใช้การจดจำ OCR แบบโครงสร้าง + ตัวอย่าง Python เต็มรูปแบบพร้อมคำอธิบาย +og_title: เรียกใช้ OCR บนภาพ – บทเรียนการสกัดข้อความแบบมีโครงสร้าง +tags: +- OCR +- Python +- Computer Vision +title: ทำ OCR บนภาพ – คู่มือฉบับสมบูรณ์สำหรับการสกัดข้อความเชิงโครงสร้าง +url: /th/python/general/run-ocr-on-image-complete-guide-to-structured-text-extractio/ +--- + +{{< blocks/products/pf/main-wrap-class >}} +{{< blocks/products/pf/main-container >}} +{{< blocks/products/pf/tutorial-page-section >}} + +# Run OCR on image – Complete Guide to Structured Text Extraction + +เคยต้อง **run OCR on image** ไฟล์แต่ไม่แน่ใจว่าจะเก็บตำแหน่งของแต่ละคำได้อย่างแม่นยำหรือไม่? คุณไม่ได้อยู่คนเดียว ในหลายโครงการ—เช่นการสแกนใบเสร็จ, การแปลงฟอร์มเป็นดิจิทัล, หรือการทดสอบ UI—คุณต้องการไม่เพียงแต่ข้อความดิบเท่านั้น แต่ยังต้องการ bounding box ที่บอกตำแหน่งของแต่ละบรรทัดบนภาพด้วย + +บทแนะนำนี้จะแสดงวิธีปฏิบัติจริงเพื่อ *run OCR on image* ด้วย **aocr** engine, ขอ **structured OCR recognition**, แล้วทำ post‑process ผลลัพธ์โดยคงรูปทรงเรขาคณิตไว้ ด้วยขั้นตอนไม่กี่บรรทัดของ Python คุณจะสามารถ **extract text with coordinates** ได้ และเข้าใจว่าทำไมโหมด structured ถึงสำคัญสำหรับงานต่อไป + +## What You’ll Learn + +- วิธี initialise OCR engine สำหรับ **structured OCR recognition** +- วิธีป้อนภาพและรับผลลัพธ์ดิบที่รวม line bounds +- วิธีรัน post‑processor ที่ทำความสะอาดข้อความโดยไม่สูญเสีย geometry +- วิธีวนลูปผ่านบรรทัดสุดท้ายและพิมพ์ข้อความพร้อม bounding box + +ไม่มีมายากล ไม่มีขั้นตอนลับ—เพียงตัวอย่างที่ทำงานได้ครบถ้วนที่คุณสามารถนำไปใช้ในโปรเจกต์ของคุณได้ทันที + +--- + +## Prerequisites + +ก่อนที่เราจะเริ่ม โปรดตรวจสอบว่าคุณได้ติดตั้งสิ่งต่อไปนี้แล้ว: + +```bash +pip install aocr ai # hypothetical packages; replace with real ones if needed +``` + +คุณยังต้องมีไฟล์ภาพ 
(`input_image.png` หรือ `.jpg`) ที่มีข้อความชัดเจนและอ่านได้ ไม่ว่าจะเป็นใบแจ้งหนี้ที่สแกนหรือภาพหน้าจอใด ๆ ก็ได้ ตราบใดที่ OCR engine มองเห็นอักขระได้ + +--- + +## Step 1: Initialise the OCR engine for structured recognition + +สิ่งแรกที่เราทำคือสร้างอินสแตนซ์ของ `aocr.Engine()` และบอกให้มันทำงานใน **structured OCR recognition** โหมด structured จะคืนค่าไม่เพียงแต่ข้อความธรรมดา แต่ยังรวมข้อมูลเรขาคณิต (bounding rectangles) ของแต่ละบรรทัด ซึ่งจำเป็นเมื่อคุณต้องแมปข้อความกลับไปยังภาพ + +```python +import aocr +import ai # hypothetical post‑processing module + +# Initialise the OCR engine +ocr_engine = aocr.Engine() + +# Request structured recognition (text + geometry) +ocr_engine.recognize_mode = aocr.RecognitionMode.Structured +``` + +> **Why this matters:** +> ในโหมดปกติ engine อาจให้คุณแค่สตริงของคำที่ต่อเนื่องกันเท่านั้น โหมด structured ให้คุณได้โครงสร้างหน้า → บรรทัด → คำ แต่ละระดับมีพิกัด ทำให้ง่ายต่อการวางผลลัพธ์บนภาพต้นฉบับหรือส่งต่อไปยังโมเดลที่รับรู้ layout + +--- + +## Step 2: Run OCR on the image and obtain raw results + +ต่อไปเราจะป้อนภาพให้กับ engine การเรียก `recognize` จะคืนค่าเป็นอ็อบเจ็กต์ `OcrResult` ที่มีคอลเลกชันของบรรทัดแต่ละบรรทัด พร้อม bounding rectangle ของมันเอง + +```python +# Load your image (any format supported by aocr) +input_image_path = "input_image.png" + +# Run OCR – this returns an OcrResult with lines and bounds +raw_result = ocr_engine.recognize(input_image_path) +``` + +ในจุดนี้ `raw_result.lines` จะเก็บอ็อบเจ็กต์ที่มีสองแอตทริบิวต์สำคัญ: + +- `text` – สตริงที่ engine จดจำได้สำหรับบรรทัดนั้น +- `bounds` – ทูเพิลรูปแบบ `(x, y, width, height)` ที่บรรยายตำแหน่งของบรรทัด + +--- + +## Step 3: Post‑process while preserving geometry + +ผลลัพธ์ OCR ดิบมักมี noise: ตัวอักษรแปลกปลอม, ช่องว่างผิดตำแหน่ง, หรือปัญหา line‑break ฟังก์ชัน `ai.run_postprocessor` จะทำความสะอาดข้อความแต่ **คง geometry ดั้งเดิม** ไว้ ดังนั้นคุณยังคงมีพิกัดที่แม่นยำ + +```python +# Apply a post‑processing step that corrects common OCR errors +postprocessed_result = 
ai.run_postprocessor(raw_result) + +# The structure (lines + bounds) stays the same, only `line.text` changes +``` + +> **Pro tip:** หากคุณมีพจนานุกรมเฉพาะโดเมน (เช่นรหัสสินค้า) ให้ส่งพจนานุกรมแบบกำหนดเองไปยัง post‑processor เพื่อเพิ่มความแม่นยำ + +--- + +## Step 4: Extract text with coordinates – iterate and display + +สุดท้าย เราจะวนลูปผ่านบรรทัดที่ทำความสะอาดแล้ว พิมพ์ bounding box ของแต่ละบรรทัดพร้อมข้อความ นี่คือหัวใจของ **extract text with coordinates** + +```python +# Print each recognised line together with its bounding box +for line in postprocessed_result.lines: + print(f"[{line.bounds}] {line.text}") +``` + +### Expected Output + +สมมติว่าภาพอินพุตมีสองบรรทัด: “Invoice #12345” และ “Total: $89.99” คุณจะเห็นผลลัพธ์ประมาณนี้: + +``` +[(15, 30, 210, 25)] Invoice #12345 +[(15, 70, 190, 25)] Total: $89.99 +``` + +ทูเพิลแรกคือ `(x, y, width, height)` ของบรรทัดบนภาพต้นฉบับ ทำให้คุณสามารถวาดสี่เหลี่ยม, ไฮไลท์ข้อความ, หรือส่งพิกัดไปยังระบบอื่นได้ + +--- + +## Visualising the Result (Optional) + +หากคุณต้องการดู bounding boxes ที่ซ้อนบนภาพ สามารถใช้ Pillow (PIL) วาดสี่เหลี่ยมได้ ตัวอย่างสคริปต์สั้น ๆ ด้านล่างนี้ หากคุณต้องการเพียงข้อมูลดิบก็สามารถข้ามขั้นตอนนี้ได้ + +```python +from PIL import Image, ImageDraw + +# Open the original image +img = Image.open(input_image_path) +draw = ImageDraw.Draw(img) + +# Draw a rectangle around each line +for line in postprocessed_result.lines: + x, y, w, h = line.bounds + draw.rectangle([x, y, x + w, y + h], outline="red", width=2) + +# Save or show the annotated image +img.save("annotated_output.png") +img.show() +``` + +![run OCR on image example showing bounding boxes](/images/ocr-bounding-boxes.png "run OCR on image – bounding box overlay") + +ข้อความ alt ด้านบนมี **primary keyword** อยู่แล้ว เพื่อตรงตามข้อกำหนด SEO ของ attribute alt รูปภาพ + +--- + +## Why Structured OCR Recognition Beats Simple Text Extraction + +คุณอาจสงสัยว่า “ทำไมต้องใช้ OCR แล้วต้องการ geometry ด้วย? 
ไม่ได้แค่ต้องการข้อความอย่างเดียวหรอกหรือ?” + +- **Spatial context:** เมื่อคุณต้องแมปฟิลด์บนฟอร์ม (เช่น “Date” ข้างค่าที่เป็นวันที่) พิกัดบอกว่า *ที่ไหน* ข้อมูลนั้นอยู่ +- **Multi‑column layouts:** ข้อความเชิงเส้นธรรมดาจะสูญเสียลำดับคอลัมน์; ข้อมูลแบบ structured รักษาลำดับคอลัมน์ไว้ +- **Post‑processing accuracy:** การรู้ขนาดกล่องช่วยให้คุณตัดสินใจได้ว่าคำนั้นเป็นหัวข้อ, หมายเหตุ, หรือ artefact ที่ไม่ต้องการ + +สรุปแล้ว **structured OCR recognition** ให้ความยืดหยุ่นในการสร้าง pipeline ที่ฉลาดกว่า—ไม่ว่าจะเป็นการบันทึกข้อมูลลงฐานข้อมูล, สร้าง PDF ที่ค้นหาได้, หรือฝึกโมเดล machine‑learning ที่เคารพ layout + +--- + +## Common Edge Cases and How to Handle Them + +| Situation | What to Watch For | Suggested Fix | +|-----------|-------------------|---------------| +| **Rotated or skewed images** | Bounding boxes อาจเอียงหรือไม่ตรง | ทำการ pre‑process ด้วย deskewing (เช่น `warpAffine` ของ OpenCV) | +| **Very small fonts** | Engine อาจพลาดอักขระ ทำให้บรรทัดว่าง | เพิ่มความละเอียดของภาพหรือใช้ `ocr_engine.set_dpi(300)` | +| **Mixed languages** | โมเดลภาษาไม่ตรงทำให้ข้อความเสีย | ตั้งค่า `ocr_engine.language = ["en", "de"]` ก่อนทำการ recognization | +| **Overlapping boxes** | Post‑processor อาจรวมสองบรรทัดโดยไม่ตั้งใจ | ตรวจสอบ `line.bounds` หลังการประมวลผล; ปรับ thresholds ใน `ai.run_postprocessor` | + +การจัดการกับกรณีเหล่านี้ตั้งแต่ต้นจะช่วยลดปัญหาเมื่อคุณขยายโซลูชันไปยังเอกสารหลายร้อยฉบับต่อวัน + +--- + +## Full End‑to‑End Script + +ด้านล่างเป็นโปรแกรมเต็มที่พร้อมรัน ซึ่งรวมทุกขั้นตอนเข้าด้วยกัน คัดลอก‑วาง, ปรับเส้นทางภาพ, แล้วคุณก็พร้อมใช้งาน + +```python +# -*- coding: utf-8 -*- +""" +Run OCR on image – extract text with coordinates using structured OCR recognition. 
+Author: Your Name +Date: 2026-05-03 +""" + +import aocr +import ai +from PIL import Image, ImageDraw + +def run_structured_ocr(image_path: str, annotate: bool = False): + # 1️⃣ Initialise the OCR engine + ocr_engine = aocr.Engine() + ocr_engine.recognize_mode = aocr.RecognitionMode.Structured + + # 2️⃣ Recognise the image + raw_result = ocr_engine.recognize(image_path) + + # 3️⃣ Post‑process while keeping geometry + processed = ai.run_postprocessor(raw_result) + + # 4️⃣ Print each line with its bounding box + for line in processed.lines: + print(f"[{line.bounds}] {line.text}") + + # Optional visualisation + if annotate: + img = Image.open(image_path) + draw = ImageDraw.Draw(img) + for line in processed.lines: + x, y, w, h = line.bounds + draw.rectangle([x, y, x + w, y + h], outline="red", width=2) + annotated_path = "annotated_" + image_path + img.save(annotated_path) + print(f"Annotated image saved as {annotated_path}") + +if __name__ == "__main__": + INPUT_IMG = "input_image.png" + run_structured_ocr(INPUT_IMG, annotate=True) +``` + +การรันสคริปต์นี้จะ: + +1. **Run OCR on image** ด้วยโหมด structured +2. **Extract text with coordinates** สำหรับทุกบรรทัด +3. ตัวเลือกสร้าง PNG ที่มี annotation แสดงกล่อง + +--- + +## Conclusion + +ตอนนี้คุณมีโซลูชันครบวงจรเพื่อ **run OCR on image** และ **extract text with coordinates** ด้วย **structured OCR recognition** โค้ดแสดงทุกขั้นตอน—from การ initialise engine ไปจนถึง post‑processing และการตรวจสอบด้วยภาพ—เพื่อให้คุณปรับใช้กับใบเสร็จ, ฟอร์ม, หรือเอกสารภาพใด ๆ ที่ต้องการการระบุตำแหน่งข้อความอย่างแม่นยำ + +ต่อไปคุณอาจลองสลับ `aocr` engine กับไลบรารีอื่น (Tesseract, EasyOCR) แล้วเปรียบเทียบผลลัพธ์ structured ของพวกมัน ทดลองกลยุทธ์ post‑processing ต่าง ๆ เช่นการตรวจสอบการสะกดหรือ regex เฉพาะโดเมน เพื่อเพิ่มความแม่นยำสำหรับงานของคุณ และหากคุณกำลังสร้าง pipeline ขนาดใหญ่ คิดถึงการเก็บคู่ `(text, bounds)` ลงฐานข้อมูลเพื่อการวิเคราะห์ต่อไป + +Happy coding, and may your OCR projects be ever accurate! 
+ +{{< /blocks/products/pf/tutorial-page-section >}} +{{< /blocks/products/pf/main-container >}} +{{< /blocks/products/pf/main-wrap-class >}} +{{< blocks/products/products-backtop-button >}} \ No newline at end of file diff --git a/ocr/turkish/python/general/extract-text-from-image-ocr-with-aspose-ai-spell-check/_index.md b/ocr/turkish/python/general/extract-text-from-image-ocr-with-aspose-ai-spell-check/_index.md new file mode 100644 index 000000000..c399b79ca --- /dev/null +++ b/ocr/turkish/python/general/extract-text-from-image-ocr-with-aspose-ai-spell-check/_index.md @@ -0,0 +1,230 @@ +--- +category: general +date: 2026-05-03 +description: Aspose OCR ve AI imla kontrolü kullanarak görüntüden metin çıkarın. Görüntüyü + OCR ile nasıl işleyebileceğinizi, OCR için görüntüyü nasıl yükleyeceğinizi, faturadan + metni nasıl tanıyacağınızı ve GPU kaynaklarını nasıl serbest bırakacağınızı öğrenin. +draft: false +keywords: +- extract text from image +- how to ocr image +- load image for ocr +- release gpu resources +- recognize text from invoice +language: tr +og_description: Aspose OCR ve AI imla kontrolü ile görüntüden metin çıkarın. Görüntüyü + OCR'lamak, OCR için görüntüyü yüklemek ve GPU kaynaklarını serbest bırakmak hakkında + adım adım rehber. +og_title: Resimden metin çıkarma – Tam OCR ve Yazım Denetimi Rehberi +tags: +- OCR +- Aspose +- AI +- Python +title: görüntüden metin çıkarma – Aspose AI Yazım Denetimi ile OCR +url: /tr/python/general/extract-text-from-image-ocr-with-aspose-ai-spell-check/ +--- + +{{< blocks/products/pf/main-wrap-class >}} +{{< blocks/products/pf/main-container >}} +{{< blocks/products/pf/tutorial-page-section >}} + +# görüntüden metin çıkarma – Tam OCR ve Yazım‑Denetimi Rehberi + +Hiç **görüntüden metin çıkarma** ihtiyacı duydunuz ama hangi kütüphanenin hem hız hem de doğruluk sağlayacağını bilmiyor muydunuz? Tek başınıza değilsiniz. 
Gerçek dünyadaki birçok projede—fatura işleme, makbuz dijitalleştirme veya sözleşme tarama gibi—bir resimden temiz, aranabilir metin elde etmek ilk engeldir. + +İyi haber şu ki Aspose OCR, hafif bir Aspose AI modeliyle birleştirildiğinde bu işi birkaç Python satırıyla halledebilir. Bu öğreticide **görüntüyü OCR nasıl yapılır** konusunu adım adım inceleyecek, resmi doğru şekilde yükleyecek, yerleşik bir yazım denetimi sonrası işleyiciyi çalıştıracak ve sonunda **GPU kaynaklarını serbest bırakacağız** böylece uygulamanız bellek dostu kalır. + +Bu rehberin sonunda **fatura** görüntülerinden metin tanıyabilecek, yaygın OCR hatalarını otomatik olarak düzeltebilecek ve bir sonraki toplu işlem için GPU'nuzu temiz tutabileceksiniz. + +--- + +## Gereksinimler + +- Python 3.9 ve üzeri (kod tip ipuçları kullanıyor ancak daha eski 3.x sürümlerinde de çalışır) +- `aspose-ocr` ve `aspose-ai` paketleri (kurulum için `pip install aspose-ocr aspose-ai` komutunu kullanın) +- CUDA‑destekli bir GPU isteğe bağlıdır; script GPU bulunamazsa CPU'ya geçer. +- Örnek bir görüntü, örn. `sample_invoice.png`, referans alabileceğiniz bir klasöre yerleştirilmiş. + +Ağır ML çerçeveleri yok, büyük model indirmeleri yok—sadece çoğu GPU'ya rahatça sığan küçük bir Q4‑K‑M kuantize modeli. + +--- + +## Adım 1: OCR Motorunu Başlatma – görüntüden metin çıkarma + +İlk olarak bir `OcrEngine` örneği oluşturur ve hangi dili beklediğinizi belirtirsiniz. Burada İngilizce'yi seçiyoruz ve düz‑metin çıktısı istiyoruz; bu, sonraki işlemler için idealdir. + +```python +import aocr # Aspose OCR package +import aspose.ai as ai # Aspose AI package + +# Initialise the OCR engine +ocr_engine = aocr.OcrEngine() +ocr_engine.language = aocr.Language.English # Choose any supported language +ocr_engine.recognize_mode = aocr.RecognitionMode.Plain # Plain text makes post‑processing easier +``` + +**Neden önemli?**: Dili ayarlamak karakter kümesini daraltır, doğruluğu artırır. 
Düz‑metin modu, genellikle sadece görüntüden metin çıkarmak istediğinizde ihtiyacınız olmayan düzen bilgilerini kaldırır. + +--- + +## Adım 2: OCR için Görüntüyü Yükleme – görüntüyü OCR nasıl yapılır + +Şimdi motora gerçek bir resim veriyoruz. `Image.load` yardımcı fonksiyonu yaygın formatları (PNG, JPEG, TIFF) anlar ve dosya‑IO inceliklerini soyutlar. + +```python +# Load the input image – this is the "load image for OCR" step +input_image = aocr.Image.load("YOUR_DIRECTORY/sample_invoice.png") +raw_text = ocr_engine.recognize(input_image) # Returns the recognised text as a string +``` + +**İpucu:** Kaynak görüntüleriniz büyükse, motora göndermeden önce yeniden boyutlandırmayı düşünün; daha küçük boyutlar tanıma kalitesini etkilemeden GPU bellek kullanımını azaltabilir. + +--- + +## Adım 3: Aspose AI Modelini Yapılandırma – faturadan metin tanıma + +Aspose AI, otomatik olarak indirebileceğiniz küçük bir GGUF modeliyle gelir. Örnek, `Qwen2.5‑3B‑Instruct‑GGUF` deposunu `q4_k_m` olarak kuantize edilmiş şekilde kullanır. Ayrıca çalışma zamanına GPU'da 20 katman ayırmasını söylüyoruz; bu, hız ve VRAM kullanımını dengeler. + +```python +# Model configuration – auto‑download a small Q4‑K‑M quantised model +model_config = ai.AsposeAIModelConfig() +model_config.allow_auto_download = "true" +model_config.hugging_face_repo_id = "bartowski/Qwen2.5-3B-Instruct-GGUF" +model_config.hugging_face_quantization = "q4_k_m" +model_config.gpu_layers = 20 # Use 20 GPU layers when a GPU is available +``` + +**Arka planda:** Kuantize model yaklaşık 1.5 GB disk alanı kaplar, tam hassasiyetli bir modelin bir kısmıdır, ancak yine de tipik OCR yazım hatalarını işaretleyecek kadar dil nüansını yakalar. + +--- + +## Adım 4: AsposeAI'yi Başlatma ve Yazım‑Denetimi Son İşlemcisini Eklemek + +Aspose AI, hazır bir yazım‑denetimi son işlemci içerir. Bunu ekleyerek, her OCR sonucu otomatik olarak temizlenir. 
+ +```python +# Initialise AsposeAI and attach the built‑in spell‑check post‑processor +ocr_ai = ai.AsposeAI(model_config) # Pass the config we just built +ocr_ai.set_post_processor(ocr_ai.postprocessor_spell_check, {}) # Empty dict → default settings +``` + +**Neden son işlemci kullanmalı?** OCR motorları sık sık “Invoice” kelimesini “Invo1ce”, “Total” kelimesini “T0tal” gibi okur. Yazım‑denetimi, ham dize üzerinde hafif bir dil modeli çalıştırarak bu hataları, özel bir sözlük yazmadan düzeltir. + +--- + +## Adım 5: OCR Sonucunda Yazım‑Denetimi Son İşlemcisini Çalıştırma + +Her şey bağlandıktan sonra, tek bir çağrı düzeltilmiş metni verir. Ayrıca orijinal ve temizlenmiş sürümleri de yazdırıyoruz, böylece iyileşmeyi görebilirsiniz. + +```python +# Run the spell‑check post‑processor on the OCR result +corrected_text = ocr_ai.run_postprocessor(raw_text) + +print("Original :", raw_text) +print("Corrected:", corrected_text) +``` + +Bir fatura için tipik çıktı şöyle görünebilir: + +``` +Original : Invo1ce #12345 +Date: 2023/07/15 +Total: $1,250.00 +... +Corrected: Invoice #12345 +Date: 2023/07/15 +Total: $1,250.00 +... +``` + +“Invo1ce” kelimesinin doğru “Invoice” kelimesine dönüştüğüne dikkat edin. Bu, yerleşik AI yazım‑denetiminin gücüdür. + +--- + +## Adım 6: GPU Kaynaklarını Serbest Bırakma – GPU kaynaklarını güvenli bir şekilde serbest bırakma + +Bunu uzun ömürlü bir hizmette (örneğin, dakikada onlarca fatura işleyen bir web API) çalıştırıyorsanız, her toplu işlemden sonra GPU bağlamını serbest bırakmalısınız. Aksi takdirde bellek sızıntıları görür ve sonunda “CUDA out of memory” hataları alırsınız. + +```python +# Release GPU resources – crucial to avoid memory leaks +ocr_ai.free_resources() +``` + +**Pro ipucu:** `free_resources()` fonksiyonunu bir `finally` bloğu içinde veya bir bağlam yöneticisiyle çağırın; böylece bir istisna oluşsa bile her zaman çalışır. 
+ +--- + +## Tam Çalışan Örnek + +Tüm parçaları bir araya getirdiğinizde, herhangi bir projeye ekleyebileceğiniz bağımsız bir script elde edersiniz. + +```python +# extract_text_from_image.py +import aocr +import aspose.ai as ai + +def main(): + # Step 1: Initialise OCR engine + ocr_engine = aocr.OcrEngine() + ocr_engine.language = aocr.Language.English + ocr_engine.recognize_mode = aocr.RecognitionMode.Plain + + # Step 2: Load image for OCR + input_image = aocr.Image.load("YOUR_DIRECTORY/sample_invoice.png") + raw_text = ocr_engine.recognize(input_image) + + # Step 3: Configure Aspose AI model + model_cfg = ai.AsposeAIModelConfig() + model_cfg.allow_auto_download = "true" + model_cfg.hugging_face_repo_id = "bartowski/Qwen2.5-3B-Instruct-GGUF" + model_cfg.hugging_face_quantization = "q4_k_m" + model_cfg.gpu_layers = 20 + + # Step 4: Initialise AI and attach spell‑check + ocr_ai = ai.AsposeAI(model_cfg) + ocr_ai.set_post_processor(ocr_ai.postprocessor_spell_check, {}) + + # Step 5: Run spell‑check + corrected_text = ocr_ai.run_postprocessor(raw_text) + + print("Original :", raw_text) + print("Corrected:", corrected_text) + + # Step 6: Release GPU resources + ocr_ai.free_resources() + +if __name__ == "__main__": + main() +``` + +Dosyayı kaydedin, görüntü yolunu ayarlayın ve `python extract_text_from_image.py` komutunu çalıştırın. Temizlenmiş fatura metninin konsola yazdırıldığını görmelisiniz. + +--- + +## Sıkça Sorulan Sorular (SSS) + +**S: Bu sadece CPU'lu makinelerde çalışır mı?** +C: Kesinlikle. GPU algılanmazsa, Aspose AI CPU çalıştırmaya geçer, ancak daha yavaş olur. `model_cfg.gpu_layers = 0` ayarlayarak CPU'yu zorlayabilirsiniz. + +**S: Faturalarım İngilizce dışındaki bir dilde olursa ne olur?** +C: `ocr_engine.language` değerini uygun enum değeriyle değiştirin (örneğin, `aocr.Language.Spanish`). Yazım‑denetimi modeli çok dilli, ancak dil‑özel bir modelle daha iyi sonuçlar alabilirsiniz. 
+ +**S: Bir döngüde birden fazla görüntüyü işleyebilir miyim?** +C: Evet. Yükleme, tanıma ve son‑işlem adımlarını bir `for` döngüsü içine taşıyın. Aynı AI örneğini yeniden kullanıyorsanız, döngüden sonra veya her toplu işlemden sonra `ocr_ai.free_resources()` çağırmayı unutmayın. + +**S: Model indirmesi ne kadar büyük?** +C: Kuantize `q4_k_m` versiyonu yaklaşık 1.5 GB. İlk çalıştırmadan sonra önbelleğe alınır, böylece sonraki çalıştırmalar anında gerçekleşir. + +--- + +## Sonuç + +Bu öğreticide Aspose OCR kullanarak **görüntüden metin çıkarma**, küçük bir AI modeli yapılandırma, yazım‑denetimi son işlemcisini uygulama ve güvenli bir şekilde **GPU kaynaklarını serbest bırakma** nasıl yapılır gösterdik. İş akışı, resmi yüklemekten temizlik yapmaya kadar her şeyi kapsar ve **faturadan metin tanıma** senaryoları için güvenilir bir boru hattı sunar. + +Sonraki adımlar? Yazım‑denetimini özel bir varlık‑çıkarma modeliyle değiştirmeyi deneyin + +{{< /blocks/products/pf/tutorial-page-section >}} +{{< /blocks/products/pf/main-container >}} +{{< /blocks/products/pf/main-wrap-class >}} +{{< blocks/products/products-backtop-button >}} \ No newline at end of file diff --git a/ocr/turkish/python/general/how-to-batch-ocr-with-aspose-ocr-full-python-guide/_index.md b/ocr/turkish/python/general/how-to-batch-ocr-with-aspose-ocr-full-python-guide/_index.md new file mode 100644 index 000000000..86d5d0c7b --- /dev/null +++ b/ocr/turkish/python/general/how-to-batch-ocr-with-aspose-ocr-full-python-guide/_index.md @@ -0,0 +1,215 @@ +--- +category: general +date: 2026-05-03 +description: Aspose OCR ve AI imla kontrolü kullanarak toplu OCR görüntüleri nasıl + işlenir. Görüntülerden metin çıkarmayı, imla kontrolü uygulamayı, ücretsiz AI kaynaklarından + yararlanmayı ve OCR hatalarını düzeltmeyi öğrenin. 
+draft: false +keywords: +- how to batch ocr +- extract text from images +- free ai resources +- apply spell check +- correct ocr errors +language: tr +og_description: Aspose OCR ve AI yazım denetimi kullanarak toplu OCR görüntüleri nasıl + işlenir. Görüntülerden metin çıkarmak, yazım denetimi uygulamak, ücretsiz AI kaynaklarından + yararlanmak ve OCR hatalarını düzeltmek için adım adım kılavuzu izleyin. +og_title: Aspose OCR ile Toplu OCR Nasıl Yapılır – Tam Python Eğitimi +tags: +- OCR +- Python +- AI +- Aspose +title: Aspose OCR ile Toplu OCR Nasıl Yapılır – Tam Python Rehberi +url: /tr/python/general/how-to-batch-ocr-with-aspose-ocr-full-python-guide/ +--- + +{{< blocks/products/pf/main-wrap-class >}} +{{< blocks/products/pf/main-container >}} +{{< blocks/products/pf/tutorial-page-section >}} + +# Aspose OCR ile Toplu OCR Nasıl Yapılır – Tam Python Rehberi + +Hiç **toplu OCR** işlemini, her dosya için ayrı bir betik yazmadan, taranmış PDF’ler ya da fotoğraflardan oluşan bir klasörü işlemek zorunda kaldınız mı? Yalnız değilsiniz. Gerçek dünyadaki birçok işlem hattında **görüntülerden metin çıkarma**, yazım hatalarını temizleme ve sonunda tahsis ettiğiniz AI kaynaklarını serbest bırakma ihtiyacınız olacak. Bu öğretici, Aspose OCR, hafif bir AI post‑işlemcisi ve birkaç Python satırıyla bunu tam olarak nasıl yapacağınızı gösteriyor. + +OCR motorunu başlatma, bir AI yazım denetleyicisi bağlama, bir klasördeki resimleri döngüye sokma ve ardından modeli temizleme adımlarını birlikte inceleyeceğiz. Sonunda **OCR hatalarını otomatik olarak düzelten** ve **AI kaynaklarını serbest bırakan** hazır bir betiğe sahip olacaksınız, böylece GPU’nuz mutlu kalacak. + +## Gereksinimler + +- Python 3.9+ (kod tip ipuçları içeriyor ancak daha eski 3.x sürümlerinde de çalışır) +- `asposeocr` paketi (`pip install asposeocr`) – OCR motorunu sağlar. +- Hugging Face modeli `bartowski/Qwen2.5-3B-Instruct-GGUF` erişimi (otomatik indirilir). 
+- En az birkaç GB VRAM’a sahip bir GPU (betik `gpu_layers = 30` olarak ayarlar, gerekirse azaltabilirsiniz). + +Harici servis yok, ücretli API yok – her şey yerel olarak çalışır. + +--- + +## Adım 1: OCR Motorunu Kur – **Toplu OCR** için Verimli Ayarlar + +Binlerce resmi işlemeye başlamadan önce sağlam bir OCR motoruna ihtiyacımız var. Aspose OCR, tek bir çağrıda dil ve tanıma modunu seçmemize izin veriyor. + +```python +# Step 1: Initialize the OCR engine for English plain‑text output +def init_ocr() -> aocr.OcrEngine: + ocr_engine = aocr.OcrEngine() + ocr_engine.language = aocr.Language.English # English language pack + ocr_engine.recognize_mode = aocr.RecognitionMode.Plain # Returns raw string, no layout + return ocr_engine +``` + +**Neden önemli:** `recognize_mode` değerini `Plain` olarak ayarlamak, çıktıyı hafif tutar; bu, daha sonra bir yazım denetimi çalıştırmayı planladığınızda idealdir. Düzen bilgisine ihtiyacınız olursa `Layout`'a geçebilirsiniz, ancak bu toplu işlerde genellikle istenmeyen bir ek yük getirir. + +> **Pro ipucu:** Çok dilli taramalarla uğraşıyorsanız, `ocr_engine.language = [aocr.Language.English, aocr.Language.Spanish]` gibi bir liste geçirebilirsiniz. + +--- + +## Adım 2: AI Post‑İşlemcisini Başlat – OCR Çıktısına **Yazım Denetimi Uygula** + +Aspose AI, istediğiniz modeli çalıştırabilen yerleşik bir post‑işlemci sunar. Burada Hugging Face’dan kuantize bir Qwen 2.5 modelini çekiyor ve yazım denetimi rutinini bağlıyoruz. 
+ +```python +# Step 2: Configure and start the AI post‑processor +def init_ai() -> aocr.ai.AsposeAI: + model_cfg = AsposeAIModelConfig() + model_cfg.allow_auto_download = "true" + model_cfg.hugging_face_repo_id = "bartowski/Qwen2.5-3B-Instruct-GGUF" + model_cfg.hugging_face_quantization = "q4_k_m" + model_cfg.gpu_layers = 30 # Adjust based on your GPU memory + ai_processor = AsposeAI() + ai_processor.initialize(model_cfg) + + # Attach the built‑in spell‑check post‑processor + ai_processor.set_post_processor(ai_processor.postprocessor_spell_check, {}) + return ai_processor +``` + +**Neden önemli:** Model `q4_k_m` olarak kuantize edilmiştir; bu, bellek kullanımını büyük ölçüde azaltırken hâlâ makul bir dil anlayışı sunar. `set_post_processor` çağrısı, Aspose AI’ye beslediğimiz her dize üzerinde **yazım denetimi uygula** adımını otomatik olarak çalıştırmasını söyler. + +> **Dikkat:** GPU’nuz 30 katmanı kaldıramıyorsa, sayıyı 15 ya da hatta 5’e düşürün – betik hâlâ çalışır, sadece biraz daha yavaş olur. + +--- + +## Adım 3: Tek Bir Resimde OCR Çalıştır ve **OCR Hatalarını Düzelt** + +OCR motoru ve AI yazım denetleyicisi hazır olduğuna göre, bunları birleştiriyoruz. Bu fonksiyon bir resmi yüklüyor, ham metni çıkarıyor ve ardından AI post‑işlemcisini çalıştırarak temizliyor. + +```python +# Step 3: OCR an image and run the spell‑check post‑processor +def ocr_and_correct(image_path: str, + ocr_engine: aocr.OcrEngine, + ai_processor: aocr.ai.AsposeAI) -> str: + image = aocr.Image.load(image_path) # Load any supported format + raw_text = ocr_engine.recognize(image) # Plain string from OCR + corrected_text = ai_processor.run_postprocessor(raw_text) + return corrected_text +``` + +**Neden önemli:** Ham OCR dizesini doğrudan AI modeline vermek, **OCR hatalarını düzelt** adımını herhangi bir regex ya da özel sözlük yazmadan gerçekleştirir. Model bağlamı anlar, bu yüzden “recieve” → “receive” gibi ve daha ince hataları bile düzeltebilir. 
+ +--- + +## Adım 4: **Görüntülerden Metin Çıkar** Toplu Olarak – Gerçek Toplu Döngü + +İşte **toplu OCR** sihrinin ortaya çıktığı yer. Bir klasörü dolaşıyor, desteklenmeyen dosyaları atlıyor ve her düzeltilmiş çıktıyı bir `.txt` dosyasına yazıyoruz. + +```python +# Step 4: Process an entire folder of images +if __name__ == "__main__": + # Initialize once – reuse for every file + ocr_engine = init_ocr() + ai_processor = init_ai() + + input_dir = "YOUR_DIRECTORY/input_images" + output_dir = "YOUR_DIRECTORY/output_text" + os.makedirs(output_dir, exist_ok=True) + + for file_name in os.listdir(input_dir): + # Only handle common image extensions + if not file_name.lower().endswith(('.png', '.jpg', '.jpeg', '.tif', '.tiff')): + continue + + image_path = os.path.join(input_dir, file_name) + corrected = ocr_and_correct(image_path, ocr_engine, ai_processor) + + txt_path = os.path.join(output_dir, + os.path.splitext(file_name)[0] + ".txt") + with open(txt_path, "w", encoding="utf-8") as txt_file: + txt_file.write(corrected) + + print(f"Processed {file_name}") + + # Step 5: Release **free AI resources** after the batch finishes + ai_processor.free_resources() +``` + +### Beklenen çıktı + +*“The quick brown fox jumps over the lazzy dog.”* cümlesini içeren bir resim için şu şekilde bir metin dosyası görürsünüz: + +``` +The quick brown fox jumps over the lazy dog. +``` + +Çift “z”nin otomatik olarak düzeltildiğine dikkat edin – bu AI yazım denetiminin etkisi. + +**Neden önemli:** OCR ve AI nesnelerini **bir kez** oluşturup tekrar kullanarak, modeli her dosya için yeniden yükleme yükünden kaçınırız. Bu, ölçekli **toplu OCR** işlemlerinin en verimli yoludur. + +--- + +## Adım 5: Temizleme – **AI Kaynaklarını Serbest Bırak** doğru şekilde + +İşiniz bittiğinde `free_resources()` çağrısı GPU belleğini, CUDA bağlamlarını ve modelin oluşturduğu geçici dosyaları serbest bırakır. 
+ +```python +# Step 5: Explicitly free GPU and model memory +ai_processor.free_resources() +``` + +Bu adımı atlamak, GPU tahsislerinin asılı kalmasına yol açabilir; bu da sonraki Python süreçlerinin çökmesine ya da VRAM’in tükenmesine neden olur. Bunu, toplu bir işin “ışıkları kapatma” kısmı olarak düşünün. + +--- + +## Yaygın Tuzaklar & Ek İpuçları + +| Sorun | Dikkat Edilecek | Çözüm | +|-------|------------------|-----| +| **Bellek yetersizliği hataları** | GPU birkaç on resimden sonra doluyor | `gpu_layers` değerini azaltın veya CPU’ya geçin (`model_cfg.gpu_layers = 0`). | +| **Dil paketi eksik** | OCR boş string döndürüyor | `asposeocr` sürümünün İngilizce dil verisini içerdiğinden emin olun; gerekirse yeniden kurun. | +| **Resim olmayan dosyalar** | Script rastgele bir `.pdf` dosyasında çöküyor | `if not file_name.lower().endswith(...)` kontrolü zaten onları atlıyor. | +| **Yazım denetimi uygulanmadı** | Çıktı ham OCR ile aynı | Döngüden önce `ai_processor.set_post_processor` çağrısının yapıldığını doğrulayın. | +| **Yavaş toplu hız** | Görüntü başına >5 saniye sürüyor | İlk çalıştırmadan sonra `model_cfg.allow_auto_download = "false"` yaparak modelin tekrar indirilmesini engelleyin. | + +**Pro ipucu:** **Görüntülerden metin çıkar** işlemini İngilizce dışındaki bir dilde yapmak istiyorsanız, sadece `ocr_engine.language` değerini uygun enum ile değiştirin (ör. `aocr.Language.French`). Aynı AI post‑işlemci hâlâ yazım denetimi uygular, ancak en iyi sonuçlar için dil‑özel bir model tercih edebilirsiniz. + +--- + +## Özet & Sonraki Adımlar + +**Toplu OCR** sürecinin tamamını ele aldık: + +1. **Başlat** – İngilizce için düz‑metin OCR motoru. +2. **Yapılandır** – AI yazım denetimi modeli ve post‑işlemci bağlaması. +3. **Çalıştır** – Her resimde OCR yap ve AI otomatik olarak **OCR hatalarını düzelt**. +4. **Döngü** – Klasör üzerinden **görüntülerden metin çıkar** toplu olarak. +5. **Temizle** – İş bittiğinde **AI kaynaklarını serbest bırak**. 
+ +Bundan sonra şunları yapabilirsiniz: + +- Düzeltlenmiş metni bir sonraki NLP işlem hattına (duygu analizi, varlık çıkarımı vb.) yönlendirin. +- `ai_processor.set_post_processor(your_custom_func, {})` ile yazım denetimini özel bir özetleyiciyle değiştirin. +- GPU birden fazla akışı kaldırabiliyorsa, `concurrent.futures.ThreadPoolExecutor` ile klasör döngüsünü paralelleştirin. + +--- + +## Son Düşünceler + +Toplu OCR bir zahmet olmak zorunda değil. Aspose OCR’u hafif bir AI modeliyle birleştirerek **görüntülerden metin çıkar**, **yazım denetimi uygula**, **OCR hatalarını düzelt** ve **AI kaynaklarını temiz** bir çözüm elde edersiniz. Betiği bir test klasöründe çalıştırın, GPU katman sayısını donanımınıza göre ayarlayın ve dakikalar içinde üretime hazır bir işlem hattına sahip olun. + +Modeli özelleştirme, PDF işleme ya da bu çözümü bir web servisine entegre etme konularında sorularınız mı var? Aşağıya yorum bırakın ya da GitHub’da bana mesaj atın. İyi kodlamalar, OCR’unuz her zaman doğru olsun! + +{{< /blocks/products/pf/tutorial-page-section >}} +{{< /blocks/products/pf/main-container >}} +{{< /blocks/products/pf/main-wrap-class >}} +{{< blocks/products/products-backtop-button >}} \ No newline at end of file diff --git a/ocr/turkish/python/general/python-ocr-tutorial-batch-ocr-processing-made-easy/_index.md b/ocr/turkish/python/general/python-ocr-tutorial-batch-ocr-processing-made-easy/_index.md new file mode 100644 index 000000000..cfaa8e727 --- /dev/null +++ b/ocr/turkish/python/general/python-ocr-tutorial-batch-ocr-processing-made-easy/_index.md @@ -0,0 +1,299 @@ +--- +category: general +date: 2026-05-03 +description: PNG görüntü dosyalarını nasıl yükleyeceğinizi, görüntüden metin tanımayı + ve toplu OCR işleme için ücretsiz AI kaynaklarını gösteren Python OCR öğreticisi. 
+draft: false +keywords: +- python ocr tutorial +- batch ocr processing +- free ai resources +- load png image +- recognize text from image +language: tr +og_description: Python OCR öğreticisi, PNG görüntülerini yükleme, görüntüden metin + tanıma ve toplu OCR işleme için ücretsiz AI kaynaklarını yönetme konusunda size + rehberlik eder. +og_title: Python OCR Öğreticisi – Ücretsiz AI Kaynaklarıyla Hızlı Toplu OCR +tags: +- OCR +- Python +- AI +title: Python OCR Eğitimi – Toplu OCR İşlemi Kolaylaştırıldı +url: /tr/python/general/python-ocr-tutorial-batch-ocr-processing-made-easy/ +--- + +{{< blocks/products/pf/main-wrap-class >}} +{{< blocks/products/pf/main-container >}} +{{< blocks/products/pf/tutorial-page-section >}} + +# Python OCR Eğitimi – Toplu OCR İşleme Kolaylaştırıldı + +Hiç **python ocr tutorial** gibi, saçınızı yolmak zorunda kalmadan onlarca PNG dosyasında OCR çalıştırmanıza izin veren bir şey aradınız mı? Yalnız değilsiniz. Birçok gerçek‑dünyada projede **load png image** dosyalarını yüklemeniz, bir motorun içine vermeniz ve işiniz bittiğinde AI kaynaklarını temizlemeniz gerekir. + +Bu rehberde, **recognize text from image** dosyalarından tam olarak nasıl metin tanınacağını, toplu olarak işleneceğini ve temel AI belleğinin nasıl serbest bırakılacağını gösteren eksiksiz, çalıştırmaya hazır bir örnek üzerinden adım adım ilerleyeceğiz. Sonunda, herhangi bir projeye ekleyebileceğiniz, ekstra süslemeler olmadan sadece temel gereksinimleri içeren bağımsız bir betiğe sahip olacaksınız. + +## Gereksinimler + +- Python 3.10 ve üzeri (burada kullanılan sözdizimi f‑string'ler ve tip ipuçlarına dayanır) +- Bir OCR kütüphanesi, `engine.recognize` metodunu sunmalı – demo amaçlı hayali bir `aocr` paketini varsayacağız, ancak Tesseract, EasyOCR vb. ile değiştirebilirsiniz. +- `ai` yardımcı modülü, kod snippet'inde gösterildiği gibi (model başlatma ve kaynak temizleme işlemlerini yönetir). 
+- İşlemek istediğiniz PNG dosyalarıyla dolu bir klasör + +`aocr` veya `ai` yüklü değilse, stub'larla taklit edebilirsiniz – sonuna yakın “Optional Stubs” bölümüne bakın. + +## Adım 1: AI Motorunu Başlatma (AI Kaynaklarını Serbest Bırakma) + +Herhangi bir resmi OCR işlem hattına vermeden önce, temel modelin hazır olması gerekir. Sadece bir kez başlatmak bellek tasarrufu sağlar ve toplu işler için hızı artırır. + +```python +# step_1_initialize.py +import ai # hypothetical helper that wraps the AI model +import aocr # OCR library + +def init_engine(config_path: str = "config.yaml"): + """ + Initialize the AI engine if it hasn't been set up yet. + This uses free AI resources – the engine will be released later. + """ + if not ai.is_initialized(): + ai.initialize(config_path) # auto‑initialize with the provided configuration + else: + print("Engine already initialized.") +``` + +**Neden önemli:** +`ai.initialize`'ı her resim için tekrarlamalı olarak çağırmak GPU belleğini defalarca ayırır ve sonunda betiğin çökmesine neden olur. `ai.is_initialized()` kontrolüyle tek bir tahsis garantilenir – bu “AI kaynaklarını serbest bırakma” ilkesidir. + +## Adım 2: Toplu OCR İşleme için PNG Resim Dosyalarını Yükleme + +Şimdi OCR ile işlemek istediğimiz tüm PNG dosyalarını topluyoruz. `pathlib` kullanmak kodun işletim sistemi bağımsız olmasını sağlar. + +```python +# step_2_load_images.py +from pathlib import Path +from typing import List + +def collect_png_paths(directory: str) -> List[Path]: + """ + Scan `directory` and return a list of Path objects pointing to PNG files. 
+ """ + base_path = Path(directory) + if not base_path.is_dir(): + raise NotADirectoryError(f"'{directory}' is not a valid folder.") + + png_files = sorted(base_path.glob("*.png")) + if not png_files: + raise FileNotFoundError("No PNG images found in the specified directory.") + + print(f"Found {len(png_files)} PNG image(s) to process.") + return png_files +``` + +**Köşe durumu:** +Klasör PNG olmayan dosyalar (ör. JPEG) içeriyorsa bunlar göz ardı edilir, `engine.recognize`'ın desteklenmeyen bir formatta takılmasını önler. + +## Adım 3: Her Resimde OCR Çalıştırma ve Son İşlem Uygulama + +Motor hazır ve dosya listesi oluşturulduğunda, resimler üzerinde döngü kurabilir, ham metni çıkarabilir ve yaygın OCR artefaktlarını (ör. gereksiz satır sonları) temizleyen bir post‑processöre verebiliriz. + +```python +# step_3_ocr_batch.py +import aocr +import ai +from pathlib import Path +from typing import List + +def ocr_batch(image_paths: List[Path]) -> List[str]: + """ + Perform OCR on each PNG image and return a list of cleaned strings. + """ + results = [] + for image_path in image_paths: + # Load the image – aocr.Image.load abstracts away Pillow/OpenCV details + img = aocr.Image.load(str(image_path)) + + # Recognize raw text + raw_text = engine.recognize(img) + + # Refine the raw OCR output using the AI post‑processor + cleaned_text = ai.run_postprocessor(raw_text) + results.append(cleaned_text) + + print(f"Processed {image_path.name}: {len(cleaned_text)} characters extracted.") + + return results +``` + +**Neden yüklemeyi tanımadan ayırıyoruz:** +`aocr.Image.load` tembel kod çözme yapabilir, bu da büyük toplular için daha hızlıdır. Yükleme adımını açık tutmak, daha sonra JPEG veya TIFF dosyalarını işlemek isterseniz farklı bir görüntü kütüphanesine geçişi de kolaylaştırır. 
+ +## Adım 4: Temizleme – Toplu İşlem Sonrası AI Kaynaklarını Serbest Bırakma + +Toplu işlem tamamlandığında, özellikle GPU destekli makinelerde bellek sızıntılarını önlemek için modeli serbest bırakmalıyız. + +```python +# step_4_cleanup.py +import ai + +def release_resources(): + """ + Free any allocated AI resources. Safe to call multiple times. + """ + if ai.is_initialized(): + ai.free_resources() + print("AI resources have been released.") + else: + print("No AI resources were allocated.") +``` + +## Hepsini Bir Araya Getirme – Tam Script + +Aşağıda, dört adımı tutarlı bir iş akışına birleştiren tek bir dosya bulunmaktadır. `batch_ocr.py` olarak kaydedin ve komut satırından çalıştırın. + +```python +# batch_ocr.py +""" +Python OCR tutorial – end‑to‑end batch OCR processing. +Loads PNG images, runs OCR, post‑processes results, and frees AI resources. +""" + +import sys +from pathlib import Path +import ai +import aocr + +# ---------------------------------------------------------------------- +# Helper functions (copied from the steps above) +# ---------------------------------------------------------------------- +def init_engine(cfg: str = "config.yaml"): + if not ai.is_initialized(): + ai.initialize(cfg) + else: + print("Engine already initialized.") + +def collect_png_paths(directory: str): + base_path = Path(directory) + if not base_path.is_dir(): + raise NotADirectoryError(f"'{directory}' is not a valid folder.") + png_files = sorted(base_path.glob("*.png")) + if not png_files: + raise FileNotFoundError("No PNG images found in the specified directory.") + print(f"Found {len(png_files)} PNG image(s) to process.") + return png_files + +def ocr_batch(image_paths): + results = [] + for image_path in image_paths: + img = aocr.Image.load(str(image_path)) + raw_text = engine.recognize(img) + cleaned_text = ai.run_postprocessor(raw_text) + results.append(cleaned_text) + print(f"Processed {image_path.name}: {len(cleaned_text)} characters.") + return results + +def 
release_resources(): + if ai.is_initialized(): + ai.free_resources() + print("AI resources released.") + else: + print("No resources to release.") + +# ---------------------------------------------------------------------- +# Main execution block +# ---------------------------------------------------------------------- +if __name__ == "__main__": + if len(sys.argv) != 2: + print("Usage: python batch_ocr.py ") + sys.exit(1) + + image_dir = sys.argv[1] + + try: + init_engine() + png_paths = collect_png_paths(image_dir) + texts = ocr_batch(png_paths) + + # Optional: write results to a single text file + output_file = Path("ocr_results.txt") + with output_file.open("w", encoding="utf-8") as f: + for path, txt in zip(png_paths, texts): + f.write(f"--- {path.name} ---\n") + f.write(txt + "\n\n") + print(f"All results saved to {output_file.resolve()}") + finally: + release_resources() +``` + +### Beklenen Çıktı + +Üç PNG içeren bir klasörde scripti çalıştırmak şu çıktıyı verebilir: + +``` +Engine already initialized. +Found 3 PNG image(s) to process. +Processed invoice1.png: 452 characters. +Processed receipt2.png: 317 characters. +Processed flyer3.png: 689 characters. +All results saved to /home/user/ocr_results.txt +AI resources released. +``` + +`ocr_results.txt` dosyası, her resim için net bir ayırıcı ve ardından temizlenmiş OCR metnini içerecek. 
+ +## aocr ve ai için Opsiyonel Stub'lar (Gerçek Paketleriniz Yoksa) + +Ağır OCR kütüphanelerini dahil etmeden akışı test etmek istiyorsanız, minimal taklit modüller oluşturabilirsiniz: + +```python +# aocr/__init__.py +class Image: + @staticmethod + def load(path): + return f"ImageObject({path})" + +def dummy_recognize(image): + return "Raw OCR output for " + str(image) + +engine = type("Engine", (), {"recognize": dummy_recognize})() +``` + +```python +# ai/__init__.py +_state = {"initialized": False} + +def is_initialized(): + return _state["initialized"] + +def initialize(cfg): + print(f"Initializing AI engine with {cfg}") + _state["initialized"] = True + +def run_postprocessor(text): + # Very naive cleanup: strip extra spaces + return " ".join(text.split()) + +def free_resources(): + print("Freeing AI resources") + _state["initialized"] = False +``` + +Bu klasörleri `batch_ocr.py`'nin yanına yerleştirin, script çalışacak ve taklit sonuçları yazdıracak. + +## Profesyonel İpuçları & Yaygın Tuzaklar + +- **Bellek dalgalanmaları:** Binlerce yüksek çözünürlüklü PNG işliyorsanız, OCR'den önce yeniden boyutlandırmayı düşünün. `aocr.Image.load` genellikle bir `max_size` argümanını kabul eder. +- **Unicode yönetimi:** Çıktı dosyasını her zaman `encoding="utf-8"` ile açın; OCR motorları ASCII olmayan karakterler üretebilir. +- **Paralellik:** CPU‑ağır OCR için `ocr_batch`'i bir `concurrent.futures.ThreadPoolExecutor` içinde sarabilirsiniz. Tek bir `ai` örneği tutmayı unutmayın – her biri `ai.initialize` çağıran çok sayıda iş parçacığı oluşturmak “AI kaynaklarını serbest bırakma” hedefini bozar. +- **Hata dayanıklılığı:** Tek bir bozuk PNG'nin tüm toplu işlemi durdurmaması için her‑resim döngüsünü bir `try/except` bloğuna sarın. 
+ +## Sonuç + +Artık **python ocr tutorial**'ı, **load png image** dosyalarını nasıl yükleyeceğinizi, **batch OCR processing**'i nasıl gerçekleştireceğinizi ve **free AI resources**'ı sorumlu bir şekilde nasıl yöneteceğinizi gösteren bir örneğe sahipsiniz. Tam, çalıştırılabilir örnek, **recognize text from image** nesnelerinden metin tanımanın ve sonrasında temizlemenin tam olarak nasıl yapılacağını gösterir, böylece eksik parçaları aramadan kendi projelerinize kopyala‑yapıştır yapabilirsiniz. + +Bir sonraki adıma hazır mısınız? Stub'lanmış `aocr` ve `ai` modüllerini gerçek kütüphanelerle, örneğin `pytesseract` ve `torchvision` ile değiştirin. Scripti JSON çıktı vermek, sonuçları bir veritabanına itmek ya da bir bulut depolama kovasına entegre etmek için de genişletebilirsiniz. Gökyüzü sınır—iyi kodlamalar! + +{{< /blocks/products/pf/tutorial-page-section >}} +{{< /blocks/products/pf/main-container >}} +{{< /blocks/products/pf/main-wrap-class >}} +{{< blocks/products/products-backtop-button >}} \ No newline at end of file diff --git a/ocr/turkish/python/general/run-ocr-on-image-complete-guide-to-structured-text-extractio/_index.md b/ocr/turkish/python/general/run-ocr-on-image-complete-guide-to-structured-text-extractio/_index.md new file mode 100644 index 000000000..9e6fc2615 --- /dev/null +++ b/ocr/turkish/python/general/run-ocr-on-image-complete-guide-to-structured-text-extractio/_index.md @@ -0,0 +1,254 @@ +--- +category: general +date: 2026-05-03 +description: Görüntü üzerinde OCR çalıştırmayı ve yapılandırılmış OCR tanıma kullanarak + koordinatlarla metin çıkarmayı öğrenin. Adım adım Python kodu dahil. +draft: false +keywords: +- run OCR on image +- extract text with coordinates +- structured OCR recognition +- OCR post‑processing +- bounding box extraction +- image text detection +language: tr +og_description: Görüntüde OCR çalıştırın ve yapılandırılmış OCR tanıma kullanarak + koordinatlarıyla birlikte metni elde edin. Açıklamalı tam Python örneği. 
+og_title: Görselde OCR Çalıştır – Yapılandırılmış Metin Çıkarma Öğreticisi +tags: +- OCR +- Python +- Computer Vision +title: Görüntüde OCR Çalıştırma – Yapılandırılmış Metin Çıkarma İçin Tam Kılavuz +url: /tr/python/general/run-ocr-on-image-complete-guide-to-structured-text-extractio/ +--- + +{{< blocks/products/pf/main-wrap-class >}} +{{< blocks/products/pf/main-container >}} +{{< blocks/products/pf/tutorial-page-section >}} + +# Görüntüde OCR Çalıştırma – Yapılandırılmış Metin Çıkarma İçin Tam Kılavuz + +Hiç **run OCR on image** dosyalarını çalıştırmanız gerekti ama her kelimenin tam konumunu nasıl koruyacağınızdan emin değildiniz? Yalnız değilsiniz. Birçok projede—makbuz tarama, form dijitalleştirme veya UI testi—yalnız ham metni değil, aynı zamanda her satırın resimde nerede olduğunu gösteren sınırlama kutularına da ihtiyacınız var. + +Bu öğretici, **aocr** motorunu kullanarak *run OCR on image* işlemini, **structured OCR recognition** talep etmeyi ve ardından sonucu geometrisini koruyarak post‑process etmeyi gösterir. Sonunda sadece birkaç Python satırıyla **extract text with coordinates** yapabilecek ve yapılandırılmış modun sonraki görevler için neden önemli olduğunu anlayacaksınız. + +## Öğrenecekleriniz + +- **structured OCR recognition** için OCR motorunu nasıl başlatacağınızı. +- Bir görüntüyü nasıl besleyeceğinizi ve satır sınırlarını içeren ham sonuçları nasıl alacağınızı. +- Geometrisini kaybetmeden metni temizleyen bir post‑processor'ı nasıl çalıştıracağınızı. +- Son satırlar üzerinde nasıl döngü yapıp her metin parçasını sınırlama kutusuyla birlikte nasıl yazdıracağınızı. + +Sihir yok, gizli adım yok—kendi projenize ekleyebileceğiniz tam, çalıştırılabilir bir örnek. 
+ +--- + +## Önkoşullar + +İçeriğe girmeden önce, aşağıdakilerin yüklü olduğundan emin olun: + +```bash +pip install aocr ai # hypothetical packages; replace with real ones if needed +``` + +Ayrıca net, okunabilir metin içeren bir görüntü dosyasına (`input_image.png` veya `.jpg`) ihtiyacınız olacak. Tarama faturası olsun ya da ekran görüntüsü, OCR motorunun karakterleri görebildiği sürece işe yarar. + +--- + +## Adım 1: Yapılandırılmış tanıma için OCR motorunu başlatma + +İlk yaptığımız şey `aocr.Engine()` bir örnek oluşturmak ve ona **structured OCR recognition** istediğimizi söylemektir. Yapılandırılmış mod yalnızca düz metni değil, aynı zamanda her satır için geometrik verileri (sınırlama dikdörtgenleri) döndürür; bu, metni görüntüye geri eşlemeniz gerektiğinde çok önemlidir. + +```python +import aocr +import ai # hypothetical post‑processing module + +# Initialise the OCR engine +ocr_engine = aocr.Engine() + +# Request structured recognition (text + geometry) +ocr_engine.recognize_mode = aocr.RecognitionMode.Structured +``` + +> **Neden önemli:** +> Varsayılan modda motor yalnızca birleştirilmiş kelimeler dizisi verebilir. Yapılandırılmış mod, sayfalar → satırlar → kelimeler hiyerarşisini, her biri koordinatlarla birlikte, sağlar; bu da sonuçları orijinal görüntünün üzerine yerleştirmeyi veya bir layout‑aware modele beslemeyi çok daha kolaylaştırır. + +--- + +## Adım 2: Görüntüde OCR Çalıştırma ve ham sonuçları elde etme + +Şimdi görüntüyü motora besliyoruz. `recognize` çağrısı, her biri kendi sınırlama dikdörtgenine sahip satırların bir koleksiyonunu içeren bir `OcrResult` nesnesi döndürür. + +```python +# Load your image (any format supported by aocr) +input_image_path = "input_image.png" + +# Run OCR – this returns an OcrResult with lines and bounds +raw_result = ocr_engine.recognize(input_image_path) +``` + +Bu noktada `raw_result.lines` iki önemli özelliğe sahip nesneler tutar: + +- `text` – o satır için tanınan dize. 
+- `bounds` – satırın konumunu tanımlayan `(x, y, width, height)` gibi bir tuple. + +--- + +## Adım 3: Geometriyi koruyarak post‑process yapma + +Ham OCR çıktısı genellikle gürültülüdür: rastgele karakterler, yanlış yerleştirilmiş boşluklar veya satır sonu sorunları. `ai.run_postprocessor` işlevi metni temizler ancak **orijinal geometriyi** bozulmadan tutar, böylece hâlâ doğru koordinatlara sahipsiniz. + +```python +# Apply a post‑processing step that corrects common OCR errors +postprocessed_result = ai.run_postprocessor(raw_result) + +# The structure (lines + bounds) stays the same, only `line.text` changes +``` + +> **Pro ipucu:** Alan‑spesifik sözlükleriniz (ör. ürün kodları) varsa, doğruluğu artırmak için post‑processor'a özel bir sözlük besleyin. + +--- + +## Adım 4: Koordinatlarla metin çıkarma – yinele ve göster + +Son olarak, temizlenmiş satırlar üzerinde döngü yapar, her satırın sınırlama kutusunu metniyle birlikte yazdırırız. Bu, **extract text with coordinates** işleminin özüdür. + +```python +# Print each recognised line together with its bounding box +for line in postprocessed_result.lines: + print(f"[{line.bounds}] {line.text}") +``` + +### Beklenen Çıktı + +Girdi görüntüsünün iki satır içerdiğini varsayalım: “Invoice #12345” ve “Total: $89.99”, aşağıdakine benzer bir şey göreceksiniz: + +``` +[(15, 30, 210, 25)] Invoice #12345 +[(15, 70, 190, 25)] Total: $89.99 +``` + +İlk tuple, orijinal görüntüdeki satırın `(x, y, width, height)` değeridir; bu sayede dikdörtgen çizebilir, metni vurgulayabilir veya koordinatları başka bir sisteme besleyebilirsiniz. + +--- + +## Sonucu Görselleştirme (İsteğe Bağlı) + +Sınırlama kutularının görüntü üzerine bindirilmiş halini görmek istiyorsanız, Pillow (PIL) kullanarak dikdörtgen çizebilirsiniz. Aşağıda hızlı bir kod parçacığı var; yalnızca ham verilere ihtiyacınız varsa atlayabilirsiniz. 
+ +```python +from PIL import Image, ImageDraw + +# Open the original image +img = Image.open(input_image_path) +draw = ImageDraw.Draw(img) + +# Draw a rectangle around each line +for line in postprocessed_result.lines: + x, y, w, h = line.bounds + draw.rectangle([x, y, x + w, y + h], outline="red", width=2) + +# Save or show the annotated image +img.save("annotated_output.png") +img.show() +``` + +![run OCR on image örneği, sınırlama kutularını gösteriyor](/images/ocr-bounding-boxes.png "run OCR on image – sınırlama kutusu katmanı") + +Yukarıdaki alt metin **primary keyword** içerir ve görüntü alt öznitelikleri için SEO gereksinimini karşılar. + +--- + +## Neden Yapılandırılmış OCR Tanıma, Basit Metin Çıkarma'dan Daha İyi + +Şöyle düşünebilirsiniz: “Sadece OCR çalıştırıp metni alabilir miyim? Neden geometriyle uğraşayım?” + +- **Spatial context:** Bir formdaki alanları eşlemeniz gerektiğinde (ör. “Date” tarih değerinin yanında), koordinatlar verinin *nerede* olduğunu söyler. +- **Multi‑column layouts:** Basit doğrusal metin sıralamayı kaybeder; yapılandırılmış veri sütun sırasını korur. +- **Post‑processing accuracy:** Kutunun boyutunu bilmek, bir kelimenin başlık mı, dipnot mu yoksa rastgele bir artefakt mı olduğunu belirlemenize yardımcı olur. + +Kısacası, **structured OCR recognition** daha akıllı veri akışları oluşturma esnekliği sağlar—veriyi bir veritabanına besliyor olun, aranabilir PDF'ler oluşturuyor olun veya layout'a saygı gösteren bir makine‑learning modeli eğitiyor olun. + +--- + +## Yaygın Kenar Durumları ve Nasıl Ele Alınır + +| Durum | Dikkat Edilmesi Gereken | Önerilen Çözüm | +|-----------|-------------------|---------------| +| **Döndürülmüş veya eğik görüntüler** | Sınırlama kutuları eksen dışı olabilir. | Deskewing (ör. OpenCV’nin `warpAffine`) ile ön‑işlem yapın. | +| **Çok küçük fontlar** | Motor karakterleri kaçırabilir, bu da boş satırlara yol açar. | Görüntü çözünürlüğünü artırın veya `ocr_engine.set_dpi(300)` kullanın. 
| +| **Karışık diller** | Yanlış dil modeli bozuk metne neden olabilir. | `ocr_engine.language = ["en", "de"]` tanıma öncesinde ayarlayın. | +| **Çakışan kutular** | Post‑processor iki satırı istemeden birleştirebilir. | `line.bounds` işleme sonrası doğrulayın; `ai.run_postprocessor` içinde eşik değerleri ayarlayın. | + +Bu senaryoları erken ele almak, özellikle çözümü günde yüzlerce belgeye ölçeklendirdiğinizde, ileride baş ağrısını önler. + +--- + +## Tam Uçtan Uca Script + +Aşağıda tüm adımları birleştiren tam, çalıştırmaya hazır program bulunuyor. Kopyala‑yapıştır, görüntü yolunu ayarla ve hazırsın. + +```python +# -*- coding: utf-8 -*- +""" +Run OCR on image – extract text with coordinates using structured OCR recognition. +Author: Your Name +Date: 2026-05-03 +""" + +import aocr +import ai +from PIL import Image, ImageDraw + +def run_structured_ocr(image_path: str, annotate: bool = False): + # 1️⃣ Initialise the OCR engine + ocr_engine = aocr.Engine() + ocr_engine.recognize_mode = aocr.RecognitionMode.Structured + + # 2️⃣ Recognise the image + raw_result = ocr_engine.recognize(image_path) + + # 3️⃣ Post‑process while keeping geometry + processed = ai.run_postprocessor(raw_result) + + # 4️⃣ Print each line with its bounding box + for line in processed.lines: + print(f"[{line.bounds}] {line.text}") + + # Optional visualisation + if annotate: + img = Image.open(image_path) + draw = ImageDraw.Draw(img) + for line in processed.lines: + x, y, w, h = line.bounds + draw.rectangle([x, y, x + w, y + h], outline="red", width=2) + annotated_path = "annotated_" + image_path + img.save(annotated_path) + print(f"Annotated image saved as {annotated_path}") + +if __name__ == "__main__": + INPUT_IMG = "input_image.png" + run_structured_ocr(INPUT_IMG, annotate=True) +``` + +Bu scripti çalıştırdığınızda: + +1. **Run OCR on image** yapılandırılmış modda. +2. **Extract text with coordinates** her satır için. +3. İsteğe bağlı olarak kutuları gösteren anotasyonlu bir PNG üretir. 
+ +--- + +## Sonuç + +Artık **run OCR on image** ve **extract text with coordinates** işlemlerini **structured OCR recognition** kullanarak yapabileceğiniz sağlam, bağımsız bir çözümünüz var. Kod, motor başlatmadan post‑process ve görsel doğrulamaya kadar her adımı gösteriyor; böylece makbuzlar, formlar veya kesin metin konumlandırması gerektiren herhangi bir görsel belgeye uyarlayabilirsiniz. + +Sırada ne var? `aocr` motorunu başka bir kütüphane (Tesseract, EasyOCR) ile değiştirip yapılandırılmış çıktılarının nasıl farklılaştığını görün. Yazım denetimi veya özel regex filtreleri gibi farklı post‑processing stratejileriyle alanınız için doğruluğu artırın. Daha büyük bir veri akışı kuruyorsanız, `(text, bounds)` çiftlerini ilerideki analizler için bir veritabanında saklamayı düşünün. + +İyi kodlamalar, ve OCR projeleriniz her zaman doğru olsun! + +{{< /blocks/products/pf/tutorial-page-section >}} +{{< /blocks/products/pf/main-container >}} +{{< /blocks/products/pf/main-wrap-class >}} +{{< blocks/products/products-backtop-button >}} \ No newline at end of file diff --git a/ocr/vietnamese/python/general/extract-text-from-image-ocr-with-aspose-ai-spell-check/_index.md b/ocr/vietnamese/python/general/extract-text-from-image-ocr-with-aspose-ai-spell-check/_index.md new file mode 100644 index 000000000..1dce91f9a --- /dev/null +++ b/ocr/vietnamese/python/general/extract-text-from-image-ocr-with-aspose-ai-spell-check/_index.md @@ -0,0 +1,231 @@ +--- +category: general +date: 2026-05-03 +description: Trích xuất văn bản từ hình ảnh bằng Aspose OCR và kiểm tra chính tả AI. + Tìm hiểu cách OCR hình ảnh, tải hình ảnh cho OCR, nhận dạng văn bản từ hoá đơn và + giải phóng tài nguyên GPU. +draft: false +keywords: +- extract text from image +- how to ocr image +- load image for ocr +- release gpu resources +- recognize text from invoice +language: vi +og_description: Trích xuất văn bản từ hình ảnh bằng Aspose OCR và kiểm tra chính tả + AI. 
Hướng dẫn chi tiết từng bước về cách OCR hình ảnh, tải hình ảnh để OCR và giải + phóng tài nguyên GPU. +og_title: Trích xuất văn bản từ hình ảnh – Hướng dẫn toàn diện về OCR và Kiểm tra + chính tả +tags: +- OCR +- Aspose +- AI +- Python +title: Trích xuất văn bản từ hình ảnh – OCR với Aspose AI Spell‑Check +url: /vi/python/general/extract-text-from-image-ocr-with-aspose-ai-spell-check/ +--- + +{{< blocks/products/pf/main-wrap-class >}} +{{< blocks/products/pf/main-container >}} +{{< blocks/products/pf/tutorial-page-section >}} + +# trích xuất văn bản từ hình ảnh – Hướng dẫn OCR & Kiểm tra chính tả hoàn chỉnh + +Bạn đã bao giờ cần **trích xuất văn bản từ hình ảnh** nhưng không chắc thư viện nào sẽ cung cấp cả tốc độ và độ chính xác? Bạn không phải là người duy nhất. Trong nhiều dự án thực tế—như xử lý hoá đơn, số hoá biên lai, hoặc quét hợp đồng—việc có được văn bản sạch, có thể tìm kiếm từ một bức ảnh là rào cản đầu tiên. + +Tin tốt là Aspose OCR kết hợp với một mô hình Aspose AI nhẹ có thể thực hiện công việc này chỉ trong vài dòng Python. Trong hướng dẫn này, chúng ta sẽ đi qua **cách OCR hình ảnh**, tải ảnh đúng cách, chạy bộ xử lý hậu kỳ kiểm tra chính tả tích hợp, và cuối cùng **giải phóng tài nguyên GPU** để ứng dụng của bạn tiết kiệm bộ nhớ. + +Khi kết thúc hướng dẫn này, bạn sẽ có thể **nhận dạng văn bản từ hình ảnh hoá đơn**, tự động sửa các lỗi OCR thường gặp, và giữ GPU của bạn sạch sẽ cho lô tiếp theo. + +--- + +## Những gì bạn cần + +- Python 3.9 hoặc mới hơn (code sử dụng type hints nhưng vẫn hoạt động trên các phiên bản 3.x cũ hơn) +- `aspose-ocr` và `aspose-ai` packages (cài đặt bằng `pip install aspose-ocr aspose-ai`) +- GPU hỗ trợ CUDA là tùy chọn; script sẽ chuyển sang CPU nếu không tìm thấy. +- Một hình ảnh mẫu, ví dụ `sample_invoice.png`, đặt trong thư mục bạn có thể tham chiếu. + +Không có khung ML nặng, không tải xuống mô hình khổng lồ—chỉ một mô hình lượng tử Q4‑K‑M nhỏ gọn, phù hợp với hầu hết các GPU. 
+ +--- + +## Bước 1: Khởi tạo Engine OCR – trích xuất văn bản từ hình ảnh + +Điều đầu tiên bạn làm là tạo một thể hiện `OcrEngine` và chỉ định ngôn ngữ bạn mong đợi. Ở đây chúng ta chọn tiếng Anh và yêu cầu đầu ra dạng plain‑text, lý tưởng cho các bước xử lý tiếp theo. + +```python +import aocr # Aspose OCR package +import aspose.ai as ai # Aspose AI package + +# Initialise the OCR engine +ocr_engine = aocr.OcrEngine() +ocr_engine.language = aocr.Language.English # Choose any supported language +ocr_engine.recognize_mode = aocr.RecognitionMode.Plain # Plain text makes post‑processing easier +``` + +**Tại sao điều này quan trọng:** Đặt ngôn ngữ sẽ thu hẹp bộ ký tự, cải thiện độ chính xác. Chế độ plain‑text loại bỏ thông tin bố cục mà bạn thường không cần khi chỉ muốn trích xuất văn bản từ hình ảnh. + +--- + +## Bước 2: Tải ảnh cho OCR – cách OCR hình ảnh + +Bây giờ chúng ta cung cấp cho engine một bức ảnh thực tế. Hàm trợ giúp `Image.load` hiểu các định dạng phổ biến (PNG, JPEG, TIFF) và trừu tượng hoá các quirks của file‑IO. + +```python +# Load the input image – this is the "load image for OCR" step +input_image = aocr.Image.load("YOUR_DIRECTORY/sample_invoice.png") +raw_text = ocr_engine.recognize(input_image) # Returns the recognised text as a string +``` + +**Mẹo:** Nếu ảnh nguồn của bạn lớn, hãy cân nhắc thay đổi kích thước chúng trước khi gửi tới engine; kích thước nhỏ hơn có thể giảm sử dụng bộ nhớ GPU mà không làm giảm chất lượng nhận dạng. + +--- + +## Bước 3: Cấu hình mô hình Aspose AI – nhận dạng văn bản từ hoá đơn + +Aspose AI đi kèm với một mô hình GGUF siêu nhỏ mà bạn có thể tự động tải xuống. Ví dụ này sử dụng repository `Qwen2.5‑3B‑Instruct‑GGUF`, được lượng tử hoá thành `q4_k_m`. Chúng ta cũng chỉ định runtime cấp phát 20 lớp trên GPU, cân bằng tốc độ và việc sử dụng VRAM. 
+ +```python +# Model configuration – auto‑download a small Q4‑K‑M quantised model +model_config = ai.AsposeAIModelConfig() +model_config.allow_auto_download = "true" +model_config.hugging_face_repo_id = "bartowski/Qwen2.5-3B-Instruct-GGUF" +model_config.hugging_face_quantization = "q4_k_m" +model_config.gpu_layers = 20 # Use 20 GPU layers when a GPU is available +``` + +**Phía sau:** Mô hình lượng tử có dung lượng khoảng 1.5 GB trên đĩa, chỉ một phần nhỏ so với mô hình độ chính xác đầy đủ, nhưng vẫn nắm bắt đủ ngữ nghĩa để phát hiện các lỗi chính tả OCR thường gặp. + +--- + +## Bước 4: Khởi tạo AsposeAI và gắn bộ xử lý hậu kỳ kiểm tra chính tả + +Aspose AI bao gồm một bộ xử lý hậu kỳ kiểm tra chính tả đã sẵn sàng. Khi gắn nó, mọi kết quả OCR sẽ được tự động làm sạch. + +```python +# Initialise AsposeAI and attach the built‑in spell‑check post‑processor +ocr_ai = ai.AsposeAI(model_config) # Pass the config we just built +ocr_ai.set_post_processor(ocr_ai.postprocessor_spell_check, {}) # Empty dict → default settings +``` + +**Tại sao nên dùng bộ xử lý hậu kỳ?** Các engine OCR thường đọc sai “Invoice” thành “Invo1ce” hoặc “Total” thành “T0tal”. Kiểm tra chính tả chạy một mô hình ngôn ngữ nhẹ trên chuỗi thô và sửa các lỗi này mà bạn không cần viết từ điển tùy chỉnh. + +--- + +## Bước 5: Chạy bộ xử lý hậu kỳ kiểm tra chính tả trên kết quả OCR + +Khi mọi thứ đã được kết nối, một lần gọi duy nhất sẽ trả về văn bản đã được sửa. Chúng tôi cũng in ra cả phiên bản gốc và phiên bản đã làm sạch để bạn có thể thấy sự cải thiện. + +```python +# Run the spell‑check post‑processor on the OCR result +corrected_text = ocr_ai.run_postprocessor(raw_text) + +print("Original :", raw_text) +print("Corrected:", corrected_text) +``` + +Kết quả mẫu cho một hoá đơn có thể trông như sau: + +``` +Original : Invo1ce #12345 +Date: 2023/07/15 +Total: $1,250.00 +... +Corrected: Invoice #12345 +Date: 2023/07/15 +Total: $1,250.00 +... +``` + +Chú ý cách “Invo1ce” đã chuyển thành từ đúng “Invoice”. 
Đó là sức mạnh của bộ kiểm tra chính tả AI tích hợp. + +--- + +## Bước 6: Giải phóng tài nguyên GPU – giải phóng tài nguyên GPU một cách an toàn + +Nếu bạn chạy đoạn mã này trong một dịch vụ lâu dài (ví dụ, một web API xử lý hàng chục hoá đơn mỗi phút), bạn phải giải phóng ngữ cảnh GPU sau mỗi lô. Nếu không, bạn sẽ gặp rò rỉ bộ nhớ và cuối cùng nhận lỗi “CUDA out of memory”. + +```python +# Release GPU resources – crucial to avoid memory leaks +ocr_ai.free_resources() +``` + +**Mẹo chuyên nghiệp:** Gọi `free_resources()` trong một khối `finally` hoặc một context manager để nó luôn được thực thi, ngay cả khi có ngoại lệ xảy ra. + +--- + +## Ví dụ Hoạt động Đầy đủ + +Kết hợp tất cả các phần lại với nhau sẽ cho bạn một script tự chứa mà bạn có thể đưa vào bất kỳ dự án nào. + +```python +# extract_text_from_image.py +import aocr +import aspose.ai as ai + +def main(): + # Step 1: Initialise OCR engine + ocr_engine = aocr.OcrEngine() + ocr_engine.language = aocr.Language.English + ocr_engine.recognize_mode = aocr.RecognitionMode.Plain + + # Step 2: Load image for OCR + input_image = aocr.Image.load("YOUR_DIRECTORY/sample_invoice.png") + raw_text = ocr_engine.recognize(input_image) + + # Step 3: Configure Aspose AI model + model_cfg = ai.AsposeAIModelConfig() + model_cfg.allow_auto_download = "true" + model_cfg.hugging_face_repo_id = "bartowski/Qwen2.5-3B-Instruct-GGUF" + model_cfg.hugging_face_quantization = "q4_k_m" + model_cfg.gpu_layers = 20 + + # Step 4: Initialise AI and attach spell‑check + ocr_ai = ai.AsposeAI(model_cfg) + ocr_ai.set_post_processor(ocr_ai.postprocessor_spell_check, {}) + + # Step 5: Run spell‑check + corrected_text = ocr_ai.run_postprocessor(raw_text) + + print("Original :", raw_text) + print("Corrected:", corrected_text) + + # Step 6: Release GPU resources + ocr_ai.free_resources() + +if __name__ == "__main__": + main() +``` + +Lưu file, điều chỉnh đường dẫn tới ảnh của bạn, và chạy `python extract_text_from_image.py`. 
Bạn sẽ thấy văn bản hoá đơn đã được làm sạch được in ra console. + +--- + +## Câu hỏi Thường gặp (FAQ) + +**Q: Điều này có hoạt động trên máy chỉ có CPU không?** +A: Chắc chắn. Nếu không phát hiện GPU, Aspose AI sẽ chuyển sang thực thi trên CPU, mặc dù sẽ chậm hơn. Bạn có thể buộc sử dụng CPU bằng cách đặt `model_cfg.gpu_layers = 0`. + +**Q: Nếu hoá đơn của tôi ở ngôn ngữ khác tiếng Anh thì sao?** +A: Thay đổi `ocr_engine.language` thành giá trị enum phù hợp (ví dụ, `aocr.Language.Spanish`). Mô hình kiểm tra chính tả hỗ trợ đa ngôn ngữ, nhưng bạn có thể có kết quả tốt hơn với mô hình chuyên cho ngôn ngữ đó. + +**Q: Tôi có thể xử lý nhiều ảnh trong một vòng lặp không?** +A: Có. Chỉ cần đưa các bước tải, nhận dạng và xử lý hậu kỳ vào trong một vòng `for`. Nhớ gọi `ocr_ai.free_resources()` sau vòng lặp hoặc sau mỗi lô nếu bạn đang tái sử dụng cùng một instance AI. + +**Q: Kích thước tải xuống của mô hình là bao nhiêu?** +A: Khoảng 1.5 GB cho phiên bản lượng tử `q4_k_m`. Nó được lưu vào cache sau lần chạy đầu tiên, vì vậy các lần thực thi tiếp theo là ngay lập tức. + +--- + +## Kết luận + +Trong hướng dẫn này, chúng tôi đã trình bày cách **trích xuất văn bản từ hình ảnh** bằng Aspose OCR, cấu hình một mô hình AI siêu nhỏ, áp dụng bộ xử lý hậu kỳ kiểm tra chính tả, và an toàn **giải phóng tài nguyên GPU**. Quy trình bao gồm mọi thứ từ tải ảnh đến dọn dẹp sau khi sử dụng, cung cấp cho bạn một pipeline đáng tin cậy cho các trường hợp **nhận dạng văn bản từ hoá đơn**. + +Bước tiếp theo? 
Hãy thử thay thế bộ kiểm tra chính tả bằng một mô hình trích xuất thực thể tùy chỉnh + +{{< /blocks/products/pf/tutorial-page-section >}} +{{< /blocks/products/pf/main-container >}} +{{< /blocks/products/pf/main-wrap-class >}} +{{< blocks/products/products-backtop-button >}} \ No newline at end of file diff --git a/ocr/vietnamese/python/general/how-to-batch-ocr-with-aspose-ocr-full-python-guide/_index.md b/ocr/vietnamese/python/general/how-to-batch-ocr-with-aspose-ocr-full-python-guide/_index.md new file mode 100644 index 000000000..f8b02441a --- /dev/null +++ b/ocr/vietnamese/python/general/how-to-batch-ocr-with-aspose-ocr-full-python-guide/_index.md @@ -0,0 +1,215 @@ +--- +category: general +date: 2026-05-03 +description: Cách batch OCR hình ảnh bằng Aspose OCR và kiểm tra chính tả AI. Học + cách trích xuất văn bản từ hình ảnh, áp dụng kiểm tra chính tả, sử dụng tài nguyên + AI miễn phí và sửa lỗi OCR. +draft: false +keywords: +- how to batch ocr +- extract text from images +- free ai resources +- apply spell check +- correct ocr errors +language: vi +og_description: Cách thực hiện OCR hàng loạt cho hình ảnh bằng Aspose OCR và kiểm + tra chính tả AI. Tham khảo hướng dẫn từng bước để trích xuất văn bản từ hình ảnh, + áp dụng kiểm tra chính tả, sử dụng tài nguyên AI miễn phí và sửa lỗi OCR. +og_title: Cách thực hiện OCR hàng loạt với Aspose OCR – Hướng dẫn Python đầy đủ +tags: +- OCR +- Python +- AI +- Aspose +title: Cách thực hiện OCR hàng loạt với Aspose OCR – Hướng dẫn Python đầy đủ +url: /vi/python/general/how-to-batch-ocr-with-aspose-ocr-full-python-guide/ +--- + +{{< blocks/products/pf/main-wrap-class >}} +{{< blocks/products/pf/main-container >}} +{{< blocks/products/pf/tutorial-page-section >}} + +# Cách thực hiện OCR hàng loạt với Aspose OCR – Hướng dẫn Python đầy đủ + +Bạn đã bao giờ tự hỏi **cách thực hiện OCR hàng loạt** cho một thư mục đầy các tệp PDF hoặc ảnh đã quét mà không cần viết một script riêng cho mỗi tệp chưa? 
Bạn không phải là người duy nhất. Trong nhiều quy trình thực tế, bạn sẽ cần **trích xuất văn bản từ hình ảnh**, sửa lỗi chính tả, và cuối cùng giải phóng bất kỳ tài nguyên AI nào bạn đã cấp phát. Hướng dẫn này sẽ chỉ cho bạn cách thực hiện điều đó với Aspose OCR, một bộ xử lý hậu AI nhẹ, và một vài dòng Python. + +Chúng ta sẽ đi qua các bước khởi tạo engine OCR, kết nối bộ kiểm tra chính tả AI, lặp qua một thư mục ảnh, và dọn dẹp mô hình sau khi hoàn thành. Khi kết thúc, bạn sẽ có một script sẵn sàng chạy mà **tự động sửa lỗi OCR** và giải phóng **các tài nguyên AI** để GPU của bạn luôn hoạt động tốt. + +## Những gì bạn cần + +- Python 3.9+ (mã sử dụng type‑hints nhưng vẫn hoạt động trên các phiên bản 3.x trước đó) +- `asposeocr` package (`pip install asposeocr`) – cung cấp engine OCR. +- Truy cập mô hình Hugging Face `bartowski/Qwen2.5-3B-Instruct-GGUF` (tự động tải về). +- GPU có ít nhất vài GB VRAM (script đặt `gpu_layers = 30`, bạn có thể giảm nếu cần). + +Không có dịch vụ bên ngoài, không có API trả phí – mọi thứ chạy cục bộ. + +--- + +## Bước 1: Thiết lập Engine OCR – **Cách thực hiện OCR hàng loạt** một cách hiệu quả + +Trước khi chúng ta có thể xử lý hàng nghìn hình ảnh, chúng ta cần một engine OCR vững chắc. Aspose OCR cho phép chúng ta chọn ngôn ngữ và chế độ nhận dạng trong một lần gọi. + +```python +# Step 1: Initialize the OCR engine for English plain‑text output +def init_ocr() -> aocr.OcrEngine: + ocr_engine = aocr.OcrEngine() + ocr_engine.language = aocr.Language.English # English language pack + ocr_engine.recognize_mode = aocr.RecognitionMode.Plain # Returns raw string, no layout + return ocr_engine +``` + +**Tại sao điều này quan trọng:** Đặt `recognize_mode` thành `Plain` giữ cho đầu ra nhẹ, lý tưởng khi bạn dự định chạy kiểm tra chính tả sau này. Nếu bạn cần thông tin bố cục, bạn sẽ chuyển sang `Layout`, nhưng điều này sẽ tăng tải mà bạn có thể không muốn trong một công việc batch. 
+ +> **Mẹo chuyên nghiệp:** Nếu bạn đang xử lý các bản quét đa ngôn ngữ, bạn có thể truyền một danh sách như `ocr_engine.language = [aocr.Language.English, aocr.Language.Spanish]`. + +--- + +## Bước 2: Khởi tạo Bộ xử lý hậu AI – **Áp dụng kiểm tra chính tả** cho đầu ra OCR + +Aspose AI đi kèm với một bộ xử lý hậu tích hợp có thể chạy bất kỳ mô hình nào bạn muốn. Ở đây chúng ta tải mô hình Qwen 2.5 đã được lượng tử hoá từ Hugging Face và kết nối quy trình kiểm tra chính tả. + +```python +# Step 2: Configure and start the AI post‑processor +def init_ai() -> aocr.ai.AsposeAI: + model_cfg = AsposeAIModelConfig() + model_cfg.allow_auto_download = "true" + model_cfg.hugging_face_repo_id = "bartowski/Qwen2.5-3B-Instruct-GGUF" + model_cfg.hugging_face_quantization = "q4_k_m" + model_cfg.gpu_layers = 30 # Adjust based on your GPU memory + ai_processor = AsposeAI() + ai_processor.initialize(model_cfg) + + # Attach the built‑in spell‑check post‑processor + ai_processor.set_post_processor(ai_processor.postprocessor_spell_check, {}) + return ai_processor +``` + +**Tại sao điều này quan trọng:** Mô hình đã được lượng tử hoá (`q4_k_m`), giảm đáng kể việc sử dụng bộ nhớ trong khi vẫn cung cấp khả năng hiểu ngôn ngữ tốt. Bằng cách gọi `set_post_processor` chúng ta nói với Aspose AI thực hiện bước **apply spell check** tự động trên bất kỳ chuỗi nào chúng ta cung cấp. + +> **Cảnh báo:** Nếu GPU của bạn không thể xử lý 30 lớp, giảm số lượng xuống 15 hoặc thậm chí 5 – script vẫn sẽ hoạt động, chỉ chậm hơn một chút. + +--- + +## Bước 3: Chạy OCR và **Sửa lỗi OCR** trên một ảnh duy nhất + +Bây giờ cả engine OCR và bộ kiểm tra chính tả AI đã sẵn sàng, chúng ta kết hợp chúng. Hàm này tải một ảnh, trích xuất văn bản thô, sau đó chạy bộ xử lý hậu AI để làm sạch. 
+ +```python +# Step 3: OCR an image and run the spell‑check post‑processor +def ocr_and_correct(image_path: str, + ocr_engine: aocr.OcrEngine, + ai_processor: aocr.ai.AsposeAI) -> str: + image = aocr.Image.load(image_path) # Load any supported format + raw_text = ocr_engine.recognize(image) # Plain string from OCR + corrected_text = ai_processor.run_postprocessor(raw_text) + return corrected_text +``` + +**Tại sao điều này quan trọng:** Việc đưa trực tiếp chuỗi OCR thô vào mô hình AI cho chúng ta một bước **correct OCR errors** mà không cần viết regex hay từ điển tùy chỉnh. Mô hình hiểu ngữ cảnh, vì vậy nó có thể sửa “recieve” → “receive” và các lỗi tinh tế hơn. + +--- + +## Bước 4: **Trích xuất văn bản từ hình ảnh** hàng loạt – Vòng lặp batch thực tế + +Đây là nơi phép màu của **cách thực hiện OCR hàng loạt** tỏa sáng. Chúng ta lặp qua một thư mục, bỏ qua các tệp không hỗ trợ, và ghi mỗi kết quả đã sửa vào tệp `.txt`. + +```python +# Step 4: Process an entire folder of images +if __name__ == "__main__": + # Initialize once – reuse for every file + ocr_engine = init_ocr() + ai_processor = init_ai() + + input_dir = "YOUR_DIRECTORY/input_images" + output_dir = "YOUR_DIRECTORY/output_text" + os.makedirs(output_dir, exist_ok=True) + + for file_name in os.listdir(input_dir): + # Only handle common image extensions + if not file_name.lower().endswith(('.png', '.jpg', '.jpeg', '.tif', '.tiff')): + continue + + image_path = os.path.join(input_dir, file_name) + corrected = ocr_and_correct(image_path, ocr_engine, ai_processor) + + txt_path = os.path.join(output_dir, + os.path.splitext(file_name)[0] + ".txt") + with open(txt_path, "w", encoding="utf-8") as txt_file: + txt_file.write(corrected) + + print(f"Processed {file_name}") + + # Step 5: Release **free AI resources** after the batch finishes + ai_processor.free_resources() +``` + +### Kết quả mong đợi + +Với một ảnh chứa câu *“The quick brown fox jumps over the lazzy dog.”* bạn sẽ thấy một tệp văn bản có: + +``` +The 
quick brown fox jumps over the lazy dog. +``` + +Lưu ý chữ “z” đôi đã được tự động sửa – đó là kết quả của AI spell‑check. + +**Tại sao điều này quan trọng:** Bằng cách tạo các đối tượng OCR và AI **một lần** và tái sử dụng chúng, chúng ta tránh được chi phí tải mô hình cho mỗi tệp. Đây là cách hiệu quả nhất để **cách thực hiện OCR hàng loạt** ở quy mô lớn. + +--- + +## Bước 5: Dọn dẹp – **Giải phóng tài nguyên AI** một cách đúng đắn + +Khi bạn hoàn thành, gọi `free_resources()` sẽ giải phóng bộ nhớ GPU, ngữ cảnh CUDA, và bất kỳ tệp tạm thời nào mà mô hình tạo ra. + +```python +# Step 5: Explicitly free GPU and model memory +ai_processor.free_resources() +``` + +Bỏ qua bước này có thể để lại các bộ nhớ GPU chưa được giải phóng, có thể gây crash cho các tiến trình Python tiếp theo hoặc tiêu tốn VRAM. Hãy nghĩ nó như phần “tắt đèn” của một công việc batch. + +--- + +## Những khó khăn thường gặp & Mẹo bổ sung + +| Vấn đề | Điều cần kiểm tra | Cách khắc phục | +|-------|------------------|----------------| +| **Lỗi hết bộ nhớ** | GPU hết bộ nhớ sau vài chục ảnh | Giảm `gpu_layers` hoặc chuyển sang CPU (`model_cfg.gpu_layers = 0`). | +| **Thiếu gói ngôn ngữ** | OCR trả về chuỗi rỗng | Đảm bảo phiên bản `asposeocr` bao gồm dữ liệu ngôn ngữ tiếng Anh; cài lại nếu cần. | +| **Tệp không phải ảnh** | Script bị crash khi gặp `.pdf` lẻ | Điều kiện `if not file_name.lower().endswith(...)` đã bỏ qua chúng. | +| **Kiểm tra chính tả không được áp dụng** | Đầu ra giống hệt OCR thô | Kiểm tra `ai_processor.set_post_processor` đã được gọi trước vòng lặp. | +| **Tốc độ batch chậm** | Mất >5 giây cho mỗi ảnh | Bật `model_cfg.allow_auto_download = "false"` sau lần chạy đầu, để mô hình không tải lại mỗi lần. | + +**Mẹo:** Nếu bạn cần **trích xuất văn bản từ hình ảnh** bằng ngôn ngữ khác tiếng Anh, chỉ cần thay đổi `ocr_engine.language` thành enum phù hợp (ví dụ, `aocr.Language.French`). 
Bộ xử lý hậu AI vẫn sẽ áp dụng kiểm tra chính tả, nhưng bạn có thể muốn một mô hình đặc thù cho ngôn ngữ đó để có kết quả tốt nhất. + +--- + +## Tóm tắt & Các bước tiếp theo + +Chúng ta đã bao quát toàn bộ quy trình cho **cách thực hiện OCR hàng loạt**: + +1. **Khởi tạo** một engine OCR dạng plain‑text cho tiếng Anh. +2. **Cấu hình** mô hình kiểm tra chính tả AI và gắn nó làm bộ xử lý hậu. +3. **Chạy** OCR trên mỗi ảnh và để AI **sửa lỗi OCR** tự động. +4. **Lặp** qua một thư mục để **trích xuất văn bản từ hình ảnh** hàng loạt. +5. **Giải phóng tài nguyên AI** khi công việc kết thúc. + +Từ đây bạn có thể: + +- Đưa văn bản đã sửa vào pipeline NLP downstream (phân tích cảm xúc, trích xuất thực thể, v.v.). +- Thay bộ xử lý hậu kiểm tra chính tả bằng một summarizer tùy chỉnh bằng cách gọi `ai_processor.set_post_processor(your_custom_func, {})`. +- Song song hoá vòng lặp thư mục bằng `concurrent.futures.ThreadPoolExecutor` nếu GPU của bạn có thể xử lý nhiều luồng. + +--- + +## Suy nghĩ cuối cùng + +Thực hiện OCR hàng loạt không cần phải là một công việc nặng nhọc. Bằng cách kết hợp Aspose OCR với một mô hình AI nhẹ, bạn có được **giải pháp một cửa** có thể **trích xuất văn bản từ hình ảnh**, **áp dụng kiểm tra chính tả**, **sửa lỗi OCR**, và **giải phóng tài nguyên AI** một cách sạch sẽ. Hãy chạy script trên một thư mục thử nghiệm, điều chỉnh số lớp GPU cho phù hợp với phần cứng của bạn, và bạn sẽ có một pipeline sẵn sàng sản xuất trong vài phút. + +Có câu hỏi về việc tùy chỉnh mô hình, xử lý PDF, hoặc tích hợp vào dịch vụ web? Để lại bình luận bên dưới hoặc nhắn tin cho tôi trên GitHub. Chúc lập trình vui vẻ, và chúc OCR của bạn luôn chính xác! 
+ +{{< /blocks/products/pf/tutorial-page-section >}} +{{< /blocks/products/pf/main-container >}} +{{< /blocks/products/pf/main-wrap-class >}} +{{< blocks/products/products-backtop-button >}} \ No newline at end of file diff --git a/ocr/vietnamese/python/general/python-ocr-tutorial-batch-ocr-processing-made-easy/_index.md b/ocr/vietnamese/python/general/python-ocr-tutorial-batch-ocr-processing-made-easy/_index.md new file mode 100644 index 000000000..c9188429c --- /dev/null +++ b/ocr/vietnamese/python/general/python-ocr-tutorial-batch-ocr-processing-made-easy/_index.md @@ -0,0 +1,300 @@ +--- +category: general +date: 2026-05-03 +description: Hướng dẫn OCR bằng Python, cho thấy cách tải tệp ảnh PNG, nhận dạng văn + bản từ hình ảnh và các tài nguyên AI miễn phí cho xử lý OCR hàng loạt. +draft: false +keywords: +- python ocr tutorial +- batch ocr processing +- free ai resources +- load png image +- recognize text from image +language: vi +og_description: Hướng dẫn OCR bằng Python sẽ hướng dẫn bạn cách tải ảnh PNG, nhận + dạng văn bản từ hình ảnh và sử dụng các tài nguyên AI miễn phí cho việc xử lý OCR + hàng loạt. +og_title: Hướng dẫn OCR bằng Python – Nhận dạng ký tự nhanh hàng loạt với tài nguyên + AI miễn phí +tags: +- OCR +- Python +- AI +title: Hướng dẫn OCR bằng Python – Xử lý OCR hàng loạt dễ dàng +url: /vi/python/general/python-ocr-tutorial-batch-ocr-processing-made-easy/ +--- + +{{< blocks/products/pf/main-wrap-class >}} +{{< blocks/products/pf/main-container >}} +{{< blocks/products/pf/tutorial-page-section >}} + +# Hướng Dẫn Python OCR – Xử Lý Hàng Loạt OCR Dễ Dàng + +Bạn đã bao giờ cần một **python ocr tutorial** thực sự cho phép bạn chạy OCR trên hàng chục tệp PNG mà không phải đau đầu không? Bạn không phải là người duy nhất. Trong nhiều dự án thực tế, bạn phải **load png image** các tệp, đưa chúng vào một engine, và sau đó dọn dẹp các tài nguyên AI khi hoàn thành. 
+ +Trong hướng dẫn này, chúng tôi sẽ đi qua một ví dụ hoàn chỉnh, sẵn sàng chạy, cho thấy chính xác cách **recognize text from image** các tệp, xử lý chúng theo lô, và giải phóng bộ nhớ AI nền. Khi kết thúc, bạn sẽ có một script tự chứa mà bạn có thể đưa vào bất kỳ dự án nào—không có phần thừa, chỉ những điều cần thiết. + +## Những Gì Bạn Cần + +- Python 3.10 hoặc mới hơn (cú pháp ở đây dựa vào f‑strings và type hints) +- Thư viện OCR cung cấp phương thức `engine.recognize` – cho mục đích demo, chúng tôi sẽ giả định một gói `aocr` giả tưởng, nhưng bạn có thể thay bằng Tesseract, EasyOCR, v.v. +- Module trợ giúp `ai` được hiển thị trong đoạn mã (nó xử lý khởi tạo mô hình và dọn dẹp tài nguyên) +- Một thư mục chứa đầy các tệp PNG bạn muốn xử lý + +Nếu bạn chưa cài đặt `aocr` hoặc `ai`, bạn có thể mô phỏng chúng bằng các stub – xem phần “Optional Stubs” ở cuối. + +## Bước 1: Khởi Tạo Engine AI (Giải Phóng Tài Nguyên AI) + +Trước khi đưa bất kỳ hình ảnh nào vào quy trình OCR, mô hình nền phải sẵn sàng. Khởi tạo chỉ một lần giúp tiết kiệm bộ nhớ và tăng tốc các công việc batch. + +```python +# step_1_initialize.py +import ai # hypothetical helper that wraps the AI model +import aocr # OCR library + +def init_engine(config_path: str = "config.yaml"): + """ + Initialize the AI engine if it hasn't been set up yet. + This uses free AI resources – the engine will be released later. + """ + if not ai.is_initialized(): + ai.initialize(config_path) # auto‑initialize with the provided configuration + else: + print("Engine already initialized.") +``` + +**Tại sao điều này quan trọng:** +Gọi `ai.initialize` lặp lại cho mỗi hình ảnh sẽ phân bổ bộ nhớ GPU liên tục, cuối cùng làm script bị sập. Bằng cách kiểm tra `ai.is_initialized()` chúng ta đảm bảo chỉ một lần phân bổ – đó là nguyên tắc “giải phóng tài nguyên AI”. + +## Bước 2: Tải Các Tệp Ảnh PNG cho Xử Lý OCR Hàng Loạt + +Bây giờ chúng ta thu thập tất cả các tệp PNG muốn chạy qua OCR. 
Sử dụng `pathlib` giúp mã không phụ thuộc vào hệ điều hành. + +```python +# step_2_load_images.py +from pathlib import Path +from typing import List + +def collect_png_paths(directory: str) -> List[Path]: + """ + Scan `directory` and return a list of Path objects pointing to PNG files. + """ + base_path = Path(directory) + if not base_path.is_dir(): + raise NotADirectoryError(f"'{directory}' is not a valid folder.") + + png_files = sorted(base_path.glob("*.png")) + if not png_files: + raise FileNotFoundError("No PNG images found in the specified directory.") + + print(f"Found {len(png_files)} PNG image(s) to process.") + return png_files +``` + +**Trường hợp đặc biệt:** +Nếu thư mục chứa các tệp không phải PNG (ví dụ, JPEG) chúng sẽ bị bỏ qua, ngăn `engine.recognize` gặp lỗi do định dạng không được hỗ trợ. + +## Bước 3: Chạy OCR trên Mỗi Hình Ảnh và Áp Dụng Xử Lý Hậu + +Với engine đã sẵn sàng và danh sách tệp đã chuẩn bị, chúng ta có thể lặp qua các hình ảnh, trích xuất văn bản thô, và chuyển cho một bộ xử lý hậu kỳ để làm sạch các artefact OCR thường gặp (như các ngắt dòng lẻ). + +```python +# step_3_ocr_batch.py +import aocr +import ai +from pathlib import Path +from typing import List + +def ocr_batch(image_paths: List[Path]) -> List[str]: + """ + Perform OCR on each PNG image and return a list of cleaned strings. + """ + results = [] + for image_path in image_paths: + # Load the image – aocr.Image.load abstracts away Pillow/OpenCV details + img = aocr.Image.load(str(image_path)) + + # Recognize raw text (the engine lives in the aocr module) + raw_text = aocr.engine.recognize(img) + + # Refine the raw OCR output using the AI post‑processor + cleaned_text = ai.run_postprocessor(raw_text) + results.append(cleaned_text) + + print(f"Processed {image_path.name}: {len(cleaned_text)} characters extracted.") + + return results +``` + +**Tại sao chúng ta tách việc tải ảnh khỏi nhận dạng:** +`aocr.Image.load` có thể thực hiện giải mã lười, nhanh hơn cho các batch lớn.
Giữ bước tải ảnh rõ ràng cũng giúp dễ dàng thay thế bằng thư viện ảnh khác nếu sau này bạn cần xử lý JPEG hoặc TIFF. + +## Bước 4: Dọn Dẹp – Giải Phóng Tài Nguyên AI Sau Khi Batch Hoàn Thành + +Khi batch hoàn tất, chúng ta phải giải phóng mô hình để tránh rò rỉ bộ nhớ, đặc biệt trên các máy có GPU. + +```python +# step_4_cleanup.py +import ai + +def release_resources(): + """ + Free any allocated AI resources. Safe to call multiple times. + """ + if ai.is_initialized(): + ai.free_resources() + print("AI resources have been released.") + else: + print("No AI resources were allocated.") +``` + +## Kết Hợp Tất Cả – Script Hoàn Chỉnh + +Dưới đây là một tệp duy nhất ghép bốn bước lại thành một quy trình liền mạch. Lưu lại dưới tên `batch_ocr.py` và chạy từ dòng lệnh. + +```python +# batch_ocr.py +""" +Python OCR tutorial – end‑to‑end batch OCR processing. +Loads PNG images, runs OCR, post‑processes results, and frees AI resources. +""" + +import sys +from pathlib import Path +import ai +import aocr + +# ---------------------------------------------------------------------- +# Helper functions (copied from the steps above) +# ---------------------------------------------------------------------- +def init_engine(cfg: str = "config.yaml"): + if not ai.is_initialized(): + ai.initialize(cfg) + else: + print("Engine already initialized.") + +def collect_png_paths(directory: str): + base_path = Path(directory) + if not base_path.is_dir(): + raise NotADirectoryError(f"'{directory}' is not a valid folder.") + png_files = sorted(base_path.glob("*.png")) + if not png_files: + raise FileNotFoundError("No PNG images found in the specified directory.") + print(f"Found {len(png_files)} PNG image(s) to process.") + return png_files + +def ocr_batch(image_paths): + results = [] + for image_path in image_paths: + img = aocr.Image.load(str(image_path)) + raw_text = aocr.engine.recognize(img) + cleaned_text = ai.run_postprocessor(raw_text) + results.append(cleaned_text) + print(f"Processed 
{image_path.name}: {len(cleaned_text)} characters.") + return results + +def release_resources(): + if ai.is_initialized(): + ai.free_resources() + print("AI resources released.") + else: + print("No resources to release.") + +# ---------------------------------------------------------------------- +# Main execution block +# ---------------------------------------------------------------------- +if __name__ == "__main__": + if len(sys.argv) != 2: + print("Usage: python batch_ocr.py ") + sys.exit(1) + + image_dir = sys.argv[1] + + try: + init_engine() + png_paths = collect_png_paths(image_dir) + texts = ocr_batch(png_paths) + + # Optional: write results to a single text file + output_file = Path("ocr_results.txt") + with output_file.open("w", encoding="utf-8") as f: + for path, txt in zip(png_paths, texts): + f.write(f"--- {path.name} ---\n") + f.write(txt + "\n\n") + print(f"All results saved to {output_file.resolve()}") + finally: + release_resources() +``` + +### Kết Quả Dự Kiến + +Chạy script trên một thư mục chứa ba PNG có thể in ra: + +``` +Engine already initialized. +Found 3 PNG image(s) to process. +Processed invoice1.png: 452 characters. +Processed receipt2.png: 317 characters. +Processed flyer3.png: 689 characters. +All results saved to /home/user/ocr_results.txt +AI resources released. +``` + +Tệp `ocr_results.txt` sẽ chứa một dấu phân cách rõ ràng cho mỗi hình ảnh, sau đó là văn bản OCR đã được làm sạch. 
+ +## Các Stub Tùy Chọn cho aocr & ai (Nếu Bạn Không Có Gói Thực) + +Nếu bạn chỉ muốn thử luồng mà không kéo các thư viện OCR nặng, bạn có thể tạo các mô-đun mock tối thiểu: + +```python +# aocr/__init__.py +class Image: + @staticmethod + def load(path): + return f"ImageObject({path})" + +def dummy_recognize(image): + return "Raw OCR output for " + str(image) + +engine = type("Engine", (), {"recognize": dummy_recognize})() +``` + +```python +# ai/__init__.py +_state = {"initialized": False} + +def is_initialized(): + return _state["initialized"] + +def initialize(cfg): + print(f"Initializing AI engine with {cfg}") + _state["initialized"] = True + +def run_postprocessor(text): + # Very naive cleanup: strip extra spaces + return " ".join(text.split()) + +def free_resources(): + print("Freeing AI resources") + _state["initialized"] = False +``` + +Đặt các thư mục này bên cạnh `batch_ocr.py` và script sẽ chạy, in ra kết quả mock. + +## Mẹo Chuyên Gia & Những Cạm Bẫy Thường Gặp + +- **Memory spikes:** Nếu bạn đang xử lý hàng nghìn PNG độ phân giải cao, hãy cân nhắc thay đổi kích thước chúng trước khi OCR. `aocr.Image.load` thường chấp nhận đối số `max_size`. +- **Unicode handling:** Luôn mở tệp đầu ra với `encoding="utf-8"`; các engine OCR có thể phát ra ký tự không phải ASCII. +- **Parallelism:** Đối với OCR phụ thuộc CPU, bạn có thể bọc `ocr_batch` trong một `concurrent.futures.ThreadPoolExecutor`. Chỉ cần nhớ giữ một thể hiện `ai` duy nhất – tạo nhiều luồng mỗi luồng gọi `ai.initialize` sẽ phá vỡ mục tiêu “giải phóng tài nguyên AI”. +- **Error resilience:** Bao quanh vòng lặp per‑image bằng một khối `try/except` để một PNG bị hỏng không làm dừng toàn bộ batch. + +## Kết Luận + +Bây giờ bạn đã có một **python ocr tutorial** minh họa cách **load png image** các tệp, thực hiện **batch OCR processing**, và quản lý **free AI resources** một cách có trách nhiệm. 
Ví dụ hoàn chỉnh, có thể chạy này cho thấy chính xác cách **recognize text from image** các đối tượng và dọn dẹp sau đó, vì vậy bạn có thể sao chép‑dán nó vào dự án của mình mà không phải tìm kiếm các phần còn thiếu. + +Sẵn sàng cho bước tiếp theo? Hãy thử thay thế các mô-đun `aocr` và `ai` đã stub bằng các thư viện thực như `pytesseract` và `torchvision`. Bạn cũng có thể mở rộng script để xuất JSON, đẩy kết quả lên cơ sở dữ liệu, hoặc tích hợp với bucket lưu trữ đám mây. Không có giới hạn—chúc bạn lập trình vui vẻ! + +{{< /blocks/products/pf/tutorial-page-section >}} +{{< /blocks/products/pf/main-container >}} +{{< /blocks/products/pf/main-wrap-class >}} +{{< blocks/products/products-backtop-button >}} \ No newline at end of file diff --git a/ocr/vietnamese/python/general/run-ocr-on-image-complete-guide-to-structured-text-extractio/_index.md b/ocr/vietnamese/python/general/run-ocr-on-image-complete-guide-to-structured-text-extractio/_index.md new file mode 100644 index 000000000..5e229169d --- /dev/null +++ b/ocr/vietnamese/python/general/run-ocr-on-image-complete-guide-to-structured-text-extractio/_index.md @@ -0,0 +1,254 @@ +--- +category: general +date: 2026-05-03 +description: Tìm hiểu cách chạy OCR trên hình ảnh và trích xuất văn bản cùng tọa độ + bằng nhận dạng OCR có cấu trúc. Bao gồm mã Python từng bước. +draft: false +keywords: +- run OCR on image +- extract text with coordinates +- structured OCR recognition +- OCR post‑processing +- bounding box extraction +- image text detection +language: vi +og_description: Chạy OCR trên hình ảnh và lấy văn bản cùng tọa độ bằng nhận dạng OCR + có cấu trúc. Ví dụ Python đầy đủ kèm giải thích. 
+og_title: Chạy OCR trên hình ảnh – Hướng dẫn trích xuất văn bản có cấu trúc +tags: +- OCR +- Python +- Computer Vision +title: Chạy OCR trên hình ảnh – Hướng dẫn toàn diện về trích xuất văn bản có cấu trúc +url: /vi/python/general/run-ocr-on-image-complete-guide-to-structured-text-extractio/ +--- + +{{< blocks/products/pf/main-wrap-class >}} +{{< blocks/products/pf/main-container >}} +{{< blocks/products/pf/tutorial-page-section >}} + +# Chạy OCR trên hình ảnh – Hướng dẫn đầy đủ về Trích xuất Văn bản có cấu trúc + +Bạn đã bao giờ cần **run OCR on image** trên các tệp hình ảnh nhưng không chắc làm sao để giữ nguyên vị trí chính xác của từng từ chưa? Bạn không phải là người duy nhất. Trong nhiều dự án—quét biên lai, số hoá biểu mẫu, hoặc kiểm thử UI—bạn không chỉ cần văn bản thô mà còn cần các bounding box cho biết mỗi dòng nằm ở đâu trên ảnh. + +Hướng dẫn này sẽ cho bạn cách thực tế để *run OCR on image* bằng cách sử dụng engine **aocr**, yêu cầu **structured OCR recognition**, và sau đó thực hiện post‑process kết quả trong khi giữ nguyên hình học. Khi kết thúc, bạn sẽ có thể **extract text with coordinates** chỉ trong vài dòng Python, và bạn sẽ hiểu tại sao chế độ có cấu trúc lại quan trọng cho các tác vụ downstream. + +## Những gì bạn sẽ học + +- Cách khởi tạo engine OCR cho **structured OCR recognition**. +- Cách đưa một hình ảnh vào và nhận kết quả thô bao gồm giới hạn dòng. +- Cách chạy post‑processor để làm sạch văn bản mà không mất hình học. +- Cách lặp qua các dòng cuối cùng và in mỗi đoạn văn bản cùng với bounding box của nó. + +Không có phép màu, không có bước ẩn—chỉ một ví dụ hoàn chỉnh, có thể chạy được mà bạn có thể đưa vào dự án của mình. + +--- + +## Yêu cầu trước + +Trước khi chúng ta bắt đầu, hãy chắc chắn rằng bạn đã cài đặt các thành phần sau: + +```bash +pip install aocr ai # hypothetical packages; replace with real ones if needed +``` + +Bạn cũng sẽ cần một tệp hình ảnh (`input_image.png` hoặc `.jpg`) chứa văn bản rõ ràng, dễ đọc. 
Bất kỳ thứ gì từ hoá đơn đã quét đến ảnh chụp màn hình đều được, miễn là engine OCR có thể nhận thấy các ký tự. + +--- + +## Bước 1: Khởi tạo engine OCR cho nhận dạng có cấu trúc + +Điều đầu tiên chúng ta làm là tạo một thể hiện của `aocr.Engine()` và nói với nó rằng chúng ta muốn **structured OCR recognition**. Chế độ có cấu trúc không chỉ trả về văn bản thuần mà còn cả dữ liệu hình học (hình chữ nhật bao quanh) cho mỗi dòng, điều này rất quan trọng khi bạn cần ánh xạ văn bản trở lại hình ảnh. + +```python +import aocr +import ai # hypothetical post‑processing module + +# Initialise the OCR engine +ocr_engine = aocr.Engine() + +# Request structured recognition (text + geometry) +ocr_engine.recognize_mode = aocr.RecognitionMode.Structured +``` + +> **Tại sao điều này quan trọng:** +> Trong chế độ mặc định, engine có thể chỉ cung cấp cho bạn một chuỗi các từ nối liền nhau. Chế độ có cấu trúc cung cấp cho bạn một cấu trúc phân cấp các trang → dòng → từ, mỗi cái đều có tọa độ, giúp việc phủ kết quả lên ảnh gốc hoặc đưa chúng vào mô hình nhận thức bố cục dễ dàng hơn nhiều. + +--- + +## Bước 2: Chạy OCR trên hình ảnh và nhận kết quả thô + +Bây giờ chúng ta đưa hình ảnh vào engine. Lệnh `recognize` trả về một đối tượng `OcrResult` chứa một tập hợp các dòng, mỗi dòng có hình chữ nhật bao quanh riêng. + +```python +# Load your image (any format supported by aocr) +input_image_path = "input_image.png" + +# Run OCR – this returns an OcrResult with lines and bounds +raw_result = ocr_engine.recognize(input_image_path) +``` + +Tại thời điểm này `raw_result.lines` chứa các đối tượng với hai thuộc tính quan trọng: + +- `text` – chuỗi đã nhận dạng cho dòng đó. +- `bounds` – một tuple dạng `(x, y, width, height)` mô tả vị trí của dòng. + +--- + +## Bước 3: Post‑process trong khi giữ nguyên hình học + +Kết quả OCR thô thường nhiễu: ký tự lẻ, khoảng trắng sai vị trí, hoặc vấn đề ngắt dòng. 
Hàm `ai.run_postprocessor` làm sạch văn bản nhưng **giữ nguyên hình học gốc**, vì vậy bạn vẫn có tọa độ chính xác. + +```python +# Apply a post‑processing step that corrects common OCR errors +postprocessed_result = ai.run_postprocessor(raw_result) + +# The structure (lines + bounds) stays the same, only `line.text` changes +``` + +> **Mẹo chuyên nghiệp:** Nếu bạn có từ vựng chuyên ngành (ví dụ, mã sản phẩm), hãy cung cấp một từ điển tùy chỉnh cho post‑processor để cải thiện độ chính xác. + +--- + +## Bước 4: Trích xuất văn bản với tọa độ – lặp và hiển thị + +Cuối cùng, chúng ta lặp qua các dòng đã làm sạch, in ra bounding box của mỗi dòng cùng với văn bản của nó. Đây là phần cốt lõi của **extract text with coordinates**. + +```python +# Print each recognised line together with its bounding box +for line in postprocessed_result.lines: + print(f"[{line.bounds}] {line.text}") +``` + +### Kết quả dự kiến + +Giả sử ảnh đầu vào chứa hai dòng: “Invoice #12345” và “Total: $89.99”, bạn sẽ thấy một kết quả tương tự như: + +``` +[(15, 30, 210, 25)] Invoice #12345 +[(15, 70, 190, 25)] Total: $89.99 +``` + +Tuple đầu tiên là `(x, y, width, height)` của dòng trên ảnh gốc, cho phép bạn vẽ hình chữ nhật, tô sáng văn bản, hoặc đưa tọa độ vào hệ thống khác. + +--- + +## Trực quan hoá Kết quả (Tùy chọn) + +Nếu bạn muốn xem các bounding box được phủ lên ảnh, bạn có thể dùng Pillow (PIL) để vẽ hình chữ nhật. Dưới đây là một đoạn mã nhanh; bạn có thể bỏ qua nếu chỉ cần dữ liệu thô. 
+ +```python +from PIL import Image, ImageDraw + +# Open the original image +img = Image.open(input_image_path) +draw = ImageDraw.Draw(img) + +# Draw a rectangle around each line +for line in postprocessed_result.lines: + x, y, w, h = line.bounds + draw.rectangle([x, y, x + w, y + h], outline="red", width=2) + +# Save or show the annotated image +img.save("annotated_output.png") +img.show() +``` + +![ví dụ chạy OCR trên hình ảnh hiển thị các bounding box](/images/ocr-bounding-boxes.png "chạy OCR trên hình ảnh – lớp phủ bounding box") + +Văn bản alt phía trên chứa **primary keyword**, đáp ứng yêu cầu SEO cho thuộc tính alt của hình ảnh. + +--- + +## Tại sao Structured OCR Recognition vượt trội so với Trích xuất Văn bản Đơn giản + +Bạn có thể tự hỏi, “Tôi không thể chỉ chạy OCR và lấy văn bản sao? Tại sao phải bận tâm đến hình học?” + +- **Spatial context:** Khi bạn cần ánh xạ các trường trên một biểu mẫu (ví dụ, “Date” bên cạnh giá trị ngày), tọa độ cho bạn biết *ở đâu* dữ liệu nằm. +- **Multi‑column layouts:** Văn bản tuyến tính đơn giản mất thứ tự; dữ liệu có cấu trúc bảo toàn thứ tự cột. +- **Post‑processing accuracy:** Biết kích thước hộp giúp bạn quyết định một từ là tiêu đề, chú thích, hay một mảnh vụn không mong muốn. + +Tóm lại, **structured OCR recognition** cung cấp cho bạn tính linh hoạt để xây dựng các pipeline thông minh hơn—cho dù bạn đang đưa dữ liệu vào cơ sở dữ liệu, tạo PDF có thể tìm kiếm, hoặc huấn luyện mô hình machine‑learning tôn trọng bố cục. + +--- + +## Các trường hợp góc cạnh thường gặp và Cách xử lý + +| Situation | What to Watch For | Suggested Fix | +|-----------|-------------------|---------------| +| **Hình ảnh bị xoay hoặc nghiêng** | Các bounding box có thể lệch trục. | Tiền xử lý bằng cách chỉnh nghiêng (ví dụ, `warpAffine` của OpenCV). | +| **Phông chữ rất nhỏ** | Engine có thể bỏ sót ký tự, dẫn đến các dòng trống. | Tăng độ phân giải ảnh hoặc sử dụng `ocr_engine.set_dpi(300)`. 
| +| **Ngôn ngữ hỗn hợp** | Mô hình ngôn ngữ sai có thể gây ra văn bản rối. | Đặt `ocr_engine.language = ["en", "de"]` trước khi nhận dạng. | +| **Các hộp chồng lên nhau** | Post‑processor có thể hợp nhất hai dòng một cách không mong muốn. | Xác minh `line.bounds` sau khi xử lý; điều chỉnh ngưỡng trong `ai.run_postprocessor`. | + +Xử lý những kịch bản này sớm sẽ giúp bạn tránh đau đầu sau này, đặc biệt khi bạn mở rộng giải pháp lên hàng trăm tài liệu mỗi ngày. + +--- + +## Script Đầy đủ Từ đầu đến cuối + +Dưới đây là chương trình hoàn chỉnh, sẵn sàng chạy, kết nối tất cả các bước lại với nhau. Sao chép‑dán, điều chỉnh đường dẫn ảnh, và bạn đã sẵn sàng. + +```python +# -*- coding: utf-8 -*- +""" +Run OCR on image – extract text with coordinates using structured OCR recognition. +Author: Your Name +Date: 2026-05-03 +""" + +import aocr +import ai +from PIL import Image, ImageDraw + +def run_structured_ocr(image_path: str, annotate: bool = False): + # 1️⃣ Initialise the OCR engine + ocr_engine = aocr.Engine() + ocr_engine.recognize_mode = aocr.RecognitionMode.Structured + + # 2️⃣ Recognise the image + raw_result = ocr_engine.recognize(image_path) + + # 3️⃣ Post‑process while keeping geometry + processed = ai.run_postprocessor(raw_result) + + # 4️⃣ Print each line with its bounding box + for line in processed.lines: + print(f"[{line.bounds}] {line.text}") + + # Optional visualisation + if annotate: + img = Image.open(image_path) + draw = ImageDraw.Draw(img) + for line in processed.lines: + x, y, w, h = line.bounds + draw.rectangle([x, y, x + w, y + h], outline="red", width=2) + annotated_path = "annotated_" + image_path + img.save(annotated_path) + print(f"Annotated image saved as {annotated_path}") + +if __name__ == "__main__": + INPUT_IMG = "input_image.png" + run_structured_ocr(INPUT_IMG, annotate=True) +``` + +Running this script will: + +1. **Run OCR on image** với chế độ có cấu trúc. +2. **Extract text with coordinates** cho mỗi dòng. +3. 
Tùy chọn tạo PNG đã chú thích hiển thị các hộp. + +--- + +## Kết luận + +Bây giờ bạn đã có một giải pháp vững chắc, tự chứa để **run OCR on image** và **extract text with coordinates** bằng **structured OCR recognition**. Mã nguồn minh họa mọi bước—từ khởi tạo engine đến post‑processing và xác minh trực quan—để bạn có thể áp dụng cho biên lai, biểu mẫu, hoặc bất kỳ tài liệu hình ảnh nào cần định vị văn bản chính xác. + +Tiếp theo? Hãy thử thay thế engine `aocr` bằng một thư viện khác (Tesseract, EasyOCR) và xem cách đầu ra có cấu trúc của chúng khác nhau như thế nào. Thử nghiệm các chiến lược post‑processing khác nhau, như kiểm tra chính tả hoặc bộ lọc regex tùy chỉnh, để nâng cao độ chính xác cho lĩnh vực của bạn. Và nếu bạn đang xây dựng một pipeline lớn hơn, hãy cân nhắc lưu trữ các cặp `(text, bounds)` trong cơ sở dữ liệu để phân tích sau này. + +Chúc lập trình vui vẻ, và chúc các dự án OCR của bạn luôn chính xác! + +{{< /blocks/products/pf/tutorial-page-section >}} +{{< /blocks/products/pf/main-container >}} +{{< /blocks/products/pf/main-wrap-class >}} +{{< blocks/products/products-backtop-button >}} \ No newline at end of file