Extract Bangla Text From Image

17 hours ago 3
ARTICLE AD BOX
var ocr = new IronTesseract(); ocr.Language = OcrLanguage.Bengali; // Optimization for high-density forms (18 boxes) ocr.Configuration.ReadBarCodes = false; ocr.Configuration.PageSegmentationMode = TesseractPageSegmentationMode.Auto; using (FolderBrowserDialog fbd = new FolderBrowserDialog()) { if (fbd.ShowDialog() == DialogResult.OK) { string inputFolder = fbd.SelectedPath; string outputFile = Path.Combine(inputFolder, "VoterData_18Box_Export.csv"); try { string[] files = Directory.GetFiles(inputFolder, "*.jpg"); int totalRecords = 0; using (var writer = new StreamWriter(outputFile, false, Encoding.UTF8)) { // Writing CSV Header writer.WriteLine("নাম,ভোটার নং,পিতা,মাতা,পেশা,ঠিকানা"); foreach (string filePath in files) { using (var input = new OcrInput(filePath)) { // Image optimization for dense text input.DeNoise(); input.Contrast(); var result = ocr.Read(input); string fullText = result.Text; // SPLIT LOGIC: 18 boxes usually means 18 "নাম:" labels // We split by the label to isolate each person's box // string[] cardBlocks = fullText.Split(new[] { "নাম:" }, StringSplitOptions.RemoveEmptyEntries); // string[] cardBlocks = Regex.Split(fullText, @"(?:\s+|^)নাম[:ঃ]", RegexOptions.Multiline); var blocks = Regex.Split(fullText, @"(?:\s+|^)নাম[:ঃ]", RegexOptions.Multiline) .Where(b => !string.IsNullOrWhiteSpace(b)) .ToList(); //foreach (string block in cardBlocks) foreach (string block in blocks) { // Skip noise/empty blocks if (!block.Contains("ভোটার নং:") && !block.Contains("পিতা:")) continue; // 1. Extract Name (It's the text immediately after "নাম:" until the end of that line) string name = GetFirstLine(block); // 2. Extract other fields from the remaining block text string voterId = GetValue(block, "ভোটার নং:"); string father = GetValue(block, "পিতা:"); string mother = GetValue(block, "মাতা:"); string job = GetValue(block, "পেশা:"); string address = GetValue(block, "ঠিকানা:"); // 3. Write to CSV string line = $"{EscapeCsv(name)},{EscapeCsv(voterId)},{EscapeCsv(father)},{EscapeCsv(mother)},{EscapeCsv(job)},{EscapeCsv(address)}"; writer.WriteLine(line); totalRecords++; } writer.Flush(); } rtbStatus.AppendText($"Processed: {Path.GetFileName(filePath)} (Found {totalRecords} total)\n"); } } MessageBox.Show($"সম্পন্ন হয়েছে!\nমোট ছবি: {files.Length}\nমোট ভোটার: {totalRecords}"); } catch (Exception ex) { MessageBox.Show("Error: " + ex.Message); } } }

** In my Image, total 18 box with 6 row and 3 column .
** I need to read every row from left to right .
** The OCR Read From Left to right in every row, but sometime The OCR jump top to down in row. Example: The OCR read last box of 3 no of rows, it should go to 4 no row and read from left to right, but its jump 4 no row last box, So first two boxes is escape.

Read Entire Article