ARTICLE AD BOX
// Batch-OCR every *.jpg sheet in a user-selected folder and export one CSV row per
// voter box (name, voter no., father, mother, occupation, address).
//
// Each sheet is expected to hold a grid of boxes (e.g. 6 rows x 3 columns), every box
// starting with a "নাম:" label. Tesseract's automatic page segmentation does not
// guarantee row-major reading order for such grids, so the recognised lines are
// re-ordered by their bounding boxes (top-to-bottom rows, left-to-right inside each
// row) before the text is split into per-person blocks.
var ocr = new IronTesseract();
ocr.Language = OcrLanguage.Bengali;
// Optimization for high-density forms (18 boxes)
ocr.Configuration.ReadBarCodes = false;
ocr.Configuration.PageSegmentationMode = TesseractPageSegmentationMode.Auto;

// Label that opens every box; accepts both ':' and the visually similar visarga 'ঃ'.
// Shared by the anchor detection and by the block split so both agree on what a box is.
var nameLabel = new Regex(@"(?:\s+|^)নাম[:ঃ]", RegexOptions.Multiline);

using (FolderBrowserDialog fbd = new FolderBrowserDialog())
{
    if (fbd.ShowDialog() == DialogResult.OK)
    {
        string inputFolder = fbd.SelectedPath;
        string outputFile = Path.Combine(inputFolder, "VoterData_18Box_Export.csv");
        try
        {
            string[] files = Directory.GetFiles(inputFolder, "*.jpg");
            // GetFiles does not guarantee any order; sort so the CSV rows follow the
            // file names deterministically.
            Array.Sort(files, StringComparer.OrdinalIgnoreCase);

            int totalRecords = 0;
            using (var writer = new StreamWriter(outputFile, false, Encoding.UTF8))
            {
                // Writing CSV Header
                writer.WriteLine("নাম,ভোটার নং,পিতা,মাতা,পেশা,ঠিকানা");

                foreach (string filePath in files)
                {
                    using (var input = new OcrInput(filePath))
                    {
                        // Image optimization for dense text
                        input.DeNoise();
                        input.Contrast();

                        var result = ocr.Read(input);
                        // Fallback text in the engine's own reading order.
                        string fullText = result.Text ?? string.Empty;

                        // ---- Rebuild the text in row-major (left-to-right) box order ----
                        var rawLines = result.Lines;
                        if (rawLines != null)
                        {
                            var lines = rawLines
                                .Where(l => !string.IsNullOrWhiteSpace(l.Text))
                                .Select(l => new
                                {
                                    Text = l.Text.Trim(),
                                    Left = (double)l.X,
                                    Top = (double)l.Y,
                                    CenterX = l.X + l.Width / 2.0,
                                    CenterY = l.Y + l.Height / 2.0,
                                    Height = (double)l.Height
                                })
                                .ToList();

                            // Every line carrying the name label marks the top-left of one box.
                            var anchors = lines
                                .Where(l => nameLabel.IsMatch(l.Text))
                                .OrderBy(l => l.Top)
                                .ToList();

                            // Anchors whose tops differ by less than ~2 line heights belong to
                            // the same grid row (boxes are several lines tall, so real rows
                            // are much further apart than that).
                            double rowTolerance = anchors.Count > 0
                                ? anchors.Average(a => a.Height) * 2.0
                                : 0.0;

                            // Without at least two anchors and usable geometry there is nothing
                            // to re-order; keep the engine's text untouched.
                            if (anchors.Count > 1 && rowTolerance > 0)
                            {
                                var rowTops = new double[anchors.Count]; // top Y of each row, ascending
                                var anchorRow = new int[anchors.Count];  // row index of each anchor
                                int rowCount = 0;
                                for (int i = 0; i < anchors.Count; i++)
                                {
                                    if (rowCount == 0 || anchors[i].Top - rowTops[rowCount - 1] > rowTolerance)
                                    {
                                        rowTops[rowCount++] = anchors[i].Top;
                                    }
                                    anchorRow[i] = rowCount - 1;
                                }

                                var cells = anchors
                                    .Select((a, i) => new { Row = anchorRow[i], a.Left })
                                    .ToList();

                                var ordered = lines
                                    .Select(l =>
                                    {
                                        // Row = last row starting at or above the line's centre;
                                        // -1 means page header text above the first row.
                                        int row = rowTops.Take(rowCount).Count(top => top <= l.CenterY) - 1;
                                        // Column = last box in that row starting at or left of the
                                        // line's centre; lines left of every box go to the first one.
                                        int col = row < 0
                                            ? 0
                                            : Math.Max(0, cells.Count(c => c.Row == row && c.Left <= l.CenterX) - 1);
                                        return new { Row = row, Col = col, l.Top, l.Left, l.Text };
                                    })
                                    .OrderBy(l => l.Row)
                                    .ThenBy(l => l.Col)
                                    .ThenBy(l => l.Top)
                                    .ThenBy(l => l.Left);

                                // Keep whichever line terminator the engine produced so the
                                // line-based helpers below see the format they already handle.
                                string newline = fullText.Contains("\r\n") ? "\r\n" : "\n";
                                fullText = string.Join(newline, ordered.Select(l => l.Text));
                            }
                        }

                        // SPLIT LOGIC: every box starts with a "নাম:" label, so splitting on
                        // the label isolates each person's box.
                        var blocks = nameLabel.Split(fullText)
                            .Where(b => !string.IsNullOrWhiteSpace(b))
                            .ToList();

                        foreach (string block in blocks)
                        {
                            // Skip noise/empty blocks.
                            // NOTE(review): these label checks only accept ':' while the split
                            // pattern also accepts 'ঃ' — confirm how GetValue treats that variant.
                            if (!block.Contains("ভোটার নং:") && !block.Contains("পিতা:")) continue;

                            // 1. Extract Name (the text immediately after "নাম:" until the end of that line)
                            string name = GetFirstLine(block);

                            // 2. Extract other fields from the remaining block text
                            string voterId = GetValue(block, "ভোটার নং:");
                            string father = GetValue(block, "পিতা:");
                            string mother = GetValue(block, "মাতা:");
                            string job = GetValue(block, "পেশা:");
                            string address = GetValue(block, "ঠিকানা:");

                            // 3. Write to CSV
                            string line = $"{EscapeCsv(name)},{EscapeCsv(voterId)},{EscapeCsv(father)},{EscapeCsv(mother)},{EscapeCsv(job)},{EscapeCsv(address)}";
                            writer.WriteLine(line);
                            totalRecords++;
                        }

                        // Flush after each image so finished rows reach the file before the next image is processed.
                        writer.Flush();
                    }

                    rtbStatus.AppendText($"Processed: {Path.GetFileName(filePath)} (Found {totalRecords} total)\n");
                }
            }

            MessageBox.Show($"সম্পন্ন হয়েছে!\nমোট ছবি: {files.Length}\nমোট ভোটার: {totalRecords}");
        }
        catch (Exception ex)
        {
            MessageBox.Show("Error: " + ex.Message);
        }
    }
}
** In my image there are 18 boxes in total, arranged in 6 rows and 3 columns.
** I need to read every row from left to right.
** The OCR usually reads each row from left to right, but sometimes it jumps from top to bottom instead. Example: after reading the last box of row 3, it should move on to row 4 and read it from left to right, but it jumps straight to the last box of row 4, so the first two boxes of that row are skipped.
