Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 30 additions & 0 deletions OCR/.NET MVC/OCR-with-Tesseract-in-Docker-on-Linux/.dockerignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# Files and folders excluded from the Docker build context.
# IDE, editor, and source-control metadata.
**/.classpath
**/.dockerignore
**/.env
**/.git
**/.gitignore
**/.project
**/.settings
**/.toolstarget
**/.vs
**/.vscode
**/*.*proj.user
**/*.dbmdl
**/*.jfm
**/azds.yaml
# Build output folders.
**/bin
**/charts
# Container tooling files themselves.
**/docker-compose*
**/Dockerfile*
**/node_modules
**/npm-debug.log
**/obj
**/secrets.dev.yaml
**/values.dev.yaml
LICENSE
README.md
# Exceptions: the patterns below re-include paths excluded above
# (.gitignore files and a small subset of the .git metadata).
!**/.gitignore
!.git/HEAD
!.git/config
!.git/packed-refs
!.git/refs/heads/**
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@

Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio Version 17
VisualStudioVersion = 17.14.36616.10 d17.14
MinimumVisualStudioVersion = 10.0.40219.1
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "OCR-with-Tesseract-in-Docker-on-Linux", "OCR-with-Tesseract-in-Docker-on-Linux\OCR-with-Tesseract-in-Docker-on-Linux.csproj", "{40EBF01A-F47E-433F-9C5F-1E118D6BE123}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Any CPU = Debug|Any CPU
Release|Any CPU = Release|Any CPU
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{40EBF01A-F47E-433F-9C5F-1E118D6BE123}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{40EBF01A-F47E-433F-9C5F-1E118D6BE123}.Debug|Any CPU.Build.0 = Debug|Any CPU
{40EBF01A-F47E-433F-9C5F-1E118D6BE123}.Release|Any CPU.ActiveCfg = Release|Any CPU
{40EBF01A-F47E-433F-9C5F-1E118D6BE123}.Release|Any CPU.Build.0 = Release|Any CPU
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
GlobalSection(ExtensibilityGlobals) = postSolution
SolutionGuid = {82BE2249-6CF6-4098-8CFB-FE96ABAEE376}
EndGlobalSection
EndGlobal
Original file line number Diff line number Diff line change
@@ -0,0 +1,184 @@
using Microsoft.AspNetCore.Mvc;
using OCR_with_Tesseract_in_Docker_on_Linux.Models;
using Syncfusion.Drawing;
using Syncfusion.OCRProcessor;
using Syncfusion.Pdf.Graphics;
using Syncfusion.Pdf.Parsing;
using System.Diagnostics;
using System.Xml.Linq;

namespace OCR_with_Tesseract_in_Docker_on_Linux.Controllers
{
/// <summary>
/// MVC controller that serves the sample pages and exposes the
/// <see cref="PerformOCR"/> action, which runs OCR over the bundled PDF.
/// </summary>
public class HomeController : Controller
{
    private readonly ILogger<HomeController> _logger;

    /// <summary>Creates the controller with the logger supplied by dependency injection.</summary>
    public HomeController(ILogger<HomeController> logger)
    {
        _logger = logger;
    }

    /// <summary>Renders the default view for the Index action.</summary>
    public IActionResult Index()
    {
        return View();
    }

    /// <summary>Renders the default view for the Privacy action.</summary>
    public IActionResult Privacy()
    {
        return View();
    }

    /// <summary>
    /// Runs OCR on <c>Data/Input.pdf</c> using the external Tesseract engine and
    /// returns the processed document as a download named <c>Sample.pdf</c>.
    /// </summary>
    /// <returns>A <see cref="FileStreamResult"/> with content type <c>application/pdf</c>.</returns>
    public IActionResult PerformOCR()
    {
        string docPath = Path.GetFullPath(@"Data/Input.pdf");

        // Deliberately NOT wrapped in a using block: the FileStreamResult is executed
        // after this method returns, and it disposes the stream once the response has
        // been written. Disposing it here would hand MVC an already-closed stream.
        MemoryStream stream = new MemoryStream();
        try
        {
            //Initialize the OCR processor.
            using (OCRProcessor processor = new OCRProcessor())
            using (FileStream fileStream = new FileStream(docPath, FileMode.Open, FileAccess.Read))
            {
                //Load a PDF document
                PdfLoadedDocument lDoc = new PdfLoadedDocument(fileStream);
                try
                {
                    //Set OCR language to process
                    processor.Settings.Language = Languages.English;
                    IOcrEngine tesseractEngine = new Tesseract5OCREngine();
                    processor.ExternalEngine = tesseractEngine;
                    //Process OCR by providing the PDF document.
                    processor.PerformOCR(lDoc);
                    //Save the document to memory stream
                    lDoc.Save(stream);
                }
                finally
                {
                    // Close the document even when OCR or saving throws.
                    lDoc.Close();
                }
            }
        }
        catch
        {
            // No result will take ownership of the stream on failure, so release it here.
            stream.Dispose();
            throw;
        }

        //Set the position as '0'
        stream.Position = 0;
        //Download the PDF document in the browser
        FileStreamResult fileStreamResult = new FileStreamResult(stream, "application/pdf");
        fileStreamResult.FileDownloadName = "Sample.pdf";
        return fileStreamResult;
    }

    /// <summary>Renders the error view with the current request identifier; the response is never cached.</summary>
    [ResponseCache(Duration = 0, Location = ResponseCacheLocation.None, NoStore = true)]
    public IActionResult Error()
    {
        return View(new ErrorViewModel { RequestId = Activity.Current?.Id ?? HttpContext.TraceIdentifier });
    }
}
// Tesseract5OcrEngine implementation
/// <summary>
/// <see cref="IOcrEngine"/> implementation that shells out to the <c>tesseract</c>
/// command-line tool, asks it for hOCR output, and converts that output into an
/// <see cref="OCRLayoutResult"/>.
/// </summary>
class Tesseract5OCREngine : IOcrEngine
{
    /// <summary>
    /// Runs Tesseract on the image contained in <paramref name="stream"/>.
    /// </summary>
    /// <param name="stream">Readable, seekable stream holding the image to recognize.</param>
    /// <returns>The recognized pages, lines and words with their bounding boxes.</returns>
    /// <exception cref="ArgumentException">The stream is null or not readable.</exception>
    /// <exception cref="InvalidOperationException">
    /// Tesseract exited with a non-zero code, produced no hOCR file, or produced an empty one.
    /// </exception>
    public OCRLayoutResult PerformOCR(Stream stream)
    {
        if (stream == null || !stream.CanRead)
            throw new ArgumentException("Input stream is null or not readable for OCR.", nameof(stream));

        stream.Position = 0;

        // Kept as locals (not instance fields) so one call cannot observe another call's image size.
        float imageWidth;
        float imageHeight;
        using (MemoryStream tempMemStream = new MemoryStream())
        {
            stream.CopyTo(tempMemStream);
            tempMemStream.Position = 0;
            PdfTiffImage pdfTiffImage = new PdfTiffImage(tempMemStream);
            imageHeight = pdfTiffImage.Height;
            imageWidth = pdfTiffImage.Width;
        }

        string tempImageFile = Path.GetTempFileName();
        // The same path is passed to tesseract as the output base, with "hocr" as the config argument.
        string tempHocrFile = tempImageFile + ".hocr";
        string hocrText;

        try
        {
            // Write stream to temp image file
            using (FileStream tempFileStream = new FileStream(tempImageFile, FileMode.Create, FileAccess.Write))
            {
                stream.Position = 0;
                stream.CopyTo(tempFileStream);
            }

            ProcessStartInfo startInfo = new ProcessStartInfo
            {
                FileName = "tesseract",
                Arguments = $"\"{tempImageFile}\" \"{tempImageFile}\" -l eng hocr",
                RedirectStandardError = true,
                UseShellExecute = false,
                CreateNoWindow = true
            };

            using (Process process = new Process { StartInfo = startInfo })
            {
                process.Start();
                // Drain stderr before waiting so a full pipe buffer cannot stall the child process.
                string errorOutput = process.StandardError.ReadToEnd();
                process.WaitForExit();

                if (process.ExitCode != 0)
                    throw new InvalidOperationException($"Tesseract process failed with exit code {process.ExitCode}. Error: {errorOutput}");
            }

            if (!File.Exists(tempHocrFile))
                throw new InvalidOperationException("HOCR output file not found. Tesseract might have failed or not produced output.");

            hocrText = File.ReadAllText(tempHocrFile);
        }
        finally
        {
            // Clean up temp files on success and on every failure path.
            TryDelete(tempImageFile);
            TryDelete(tempHocrFile);
        }

        if (string.IsNullOrEmpty(hocrText))
            throw new InvalidOperationException("HOCR text could not be generated or was empty.");

        var ocrLayoutResult = new OCRLayoutResult();
        BuildOCRLayoutResult(ocrLayoutResult, hocrText, imageWidth, imageHeight);
        ocrLayoutResult.ImageWidth = imageWidth;
        ocrLayoutResult.ImageHeight = imageHeight;

        return ocrLayoutResult;
    }

    /// <summary>
    /// Best-effort delete: a failure to remove a temp file must not mask the
    /// exception (or the result) of the OCR run itself.
    /// </summary>
    static void TryDelete(string path)
    {
        try
        {
            if (File.Exists(path)) File.Delete(path);
        }
        catch (IOException)
        {
        }
        catch (UnauthorizedAccessException)
        {
        }
    }

    /// <summary>
    /// Parses hOCR markup and appends one page per <c>ocr_page</c> element to
    /// <paramref name="ocr"/>, each holding its lines and words.
    /// </summary>
    /// <param name="ocr">Result object that receives the parsed pages.</param>
    /// <param name="hOcrText">hOCR (XHTML) document text.</param>
    /// <param name="imageWidth">Width of the source image; not used by the parser.</param>
    /// <param name="imageHeight">Height of the source image; not used by the parser.</param>
    void BuildOCRLayoutResult(OCRLayoutResult ocr, string hOcrText, float imageWidth, float imageHeight)
    {
        var doc = XDocument.Parse(hOcrText, LoadOptions.None);
        // Must be an XNamespace: with a plain string, ns + "div" is string concatenation and the
        // resulting text is not a valid XName, so the element lookups below would throw.
        XNamespace ns = "http://www.w3.org/1999/xhtml";

        foreach (var pageElement in doc.Descendants(ns + "div").Where(d => d.Attribute("class")?.Value == "ocr_page"))
        {
            Page ocrPage = new Page();

            foreach (var lineElement in pageElement.Descendants(ns + "span")
                .Where(s => s.Attribute("class")?.Value == "ocr_line" || s.Attribute("class")?.Value == "ocr_header"))
            {
                Line ocrLine = new Line();

                foreach (var wordElement in lineElement.Descendants(ns + "span")
                    .Where(s => s.Attribute("class")?.Value == "ocrx_word"))
                {
                    Word ocrWord = new Word { Text = wordElement.Value };
                    string? title = wordElement.Attribute("title")?.Value;

                    if (title != null)
                    {
                        // The title holds ';'-separated properties; the first is read as "bbox x0 y0 x1 y1".
                        string bboxString = title.Split(';')[0].Replace("bbox", "").Trim();
                        int[] coords = bboxString
                            .Split(' ', StringSplitOptions.RemoveEmptyEntries)
                            .Select(c => int.Parse(c, System.Globalization.CultureInfo.InvariantCulture))
                            .ToArray();

                        if (coords.Length == 4)
                        {
                            // Convert corner coordinates into origin plus size.
                            float x = coords[0];
                            float y = coords[1];
                            float width = coords[2] - coords[0];
                            float height = coords[3] - coords[1];
                            ocrWord.Rectangle = new RectangleF(x, y, width, height);
                        }
                    }

                    ocrLine.Add(ocrWord);
                }

                ocrPage.Add(ocrLine);
            }

            ocr.Add(ocrPage);
        }
    }
}
}
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# See https://aka.ms/customizecontainer to learn how to customize your debug container and how Visual Studio uses this Dockerfile to build your images for faster debugging.


# This stage is used when running from VS in fast mode (Default for Debug configuration)
# The aspnet image is required: this is a Microsoft.NET.Sdk.Web project and needs the
# ASP.NET Core shared framework. The stage must be named "base" because the final stage
# below starts with "FROM base".
FROM mcr.microsoft.com/dotnet/aspnet:8.0 AS base
# Install the Tesseract CLI and English language data while still running as root,
# then drop the apt package lists to keep the layer small.
RUN apt-get update \
    && apt-get install -y --no-install-recommends tesseract-ocr tesseract-ocr-eng \
    && rm -rf /var/lib/apt/lists/*
USER $APP_UID
WORKDIR /app


# This stage is used to build the service project
FROM mcr.microsoft.com/dotnet/sdk:8.0 AS build
ARG BUILD_CONFIGURATION=Release
WORKDIR /src
COPY ["OCR-with-Tesseract-in-Docker-on-Linux/OCR-with-Tesseract-in-Docker-on-Linux.csproj", "OCR-with-Tesseract-in-Docker-on-Linux/"]
RUN dotnet restore "./OCR-with-Tesseract-in-Docker-on-Linux/OCR-with-Tesseract-in-Docker-on-Linux.csproj"
COPY . .
WORKDIR "/src/OCR-with-Tesseract-in-Docker-on-Linux"
RUN dotnet build "./OCR-with-Tesseract-in-Docker-on-Linux.csproj" -c $BUILD_CONFIGURATION -o /app/build

# This stage is used to publish the service project to be copied to the final stage
FROM build AS publish
ARG BUILD_CONFIGURATION=Release
RUN dotnet publish "./OCR-with-Tesseract-in-Docker-on-Linux.csproj" -c $BUILD_CONFIGURATION -o /app/publish /p:UseAppHost=false

# This stage is used in production or when running from VS in regular mode (Default when not using the Debug configuration)
FROM base AS final
WORKDIR /app
COPY --from=publish /app/publish .
ENTRYPOINT ["dotnet", "OCR-with-Tesseract-in-Docker-on-Linux.dll"]
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
namespace OCR_with_Tesseract_in_Docker_on_Linux.Models
{
/// <summary>
/// View model passed to the error view.
/// </summary>
public class ErrorViewModel
{
    /// <summary>Identifier of the request that failed; may be null.</summary>
    public string? RequestId { get; set; }

    /// <summary>
    /// True when <see cref="RequestId"/> holds at least one character,
    /// false when it is null or empty.
    /// </summary>
    public bool ShowRequestId
    {
        get
        {
            return RequestId is { Length: > 0 };
        }
    }
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
<Project Sdk="Microsoft.NET.Sdk.Web">

  <PropertyGroup>
    <TargetFramework>net8.0</TargetFramework>
    <Nullable>enable</Nullable>
    <ImplicitUsings>enable</ImplicitUsings>
    <RootNamespace>OCR_with_Tesseract_in_Docker_on_Linux</RootNamespace>
    <UserSecretsId>16743565-eaf2-4e18-8eb6-e6ba08388c1f</UserSecretsId>
    <DockerDefaultTargetOS>Linux</DockerDefaultTargetOS>
  </PropertyGroup>

  <ItemGroup>
    <PackageReference Include="Microsoft.VisualStudio.Azure.Containers.Tools.Targets" Version="1.22.1" />
    <PackageReference Include="Syncfusion.PDF.OCR.Net.Core" Version="31.2.3" />
  </ItemGroup>

  <!-- HomeController.PerformOCR opens Data/Input.pdf relative to the working directory,
       so the sample document has to be copied next to the built/published application. -->
  <ItemGroup>
    <None Update="Data\**">
      <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
    </None>
  </ItemGroup>

</Project>
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
<?xml version="1.0" encoding="utf-8"?>
<Project ToolsVersion="Current" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<PropertyGroup>
<ActiveDebugProfile>IIS Express</ActiveDebugProfile>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|AnyCPU'">
<DebuggerFlavor>ProjectDebugger</DebuggerFlavor>
</PropertyGroup>
</Project>
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
// Application entry point (top-level statements): registers MVC services,
// builds the middleware pipeline, maps the default controller route, and runs the app.
var builder = WebApplication.CreateBuilder(args);

// Add services to the container.
builder.Services.AddControllersWithViews();

var app = builder.Build();

// Configure the HTTP request pipeline.
// Outside the Development environment, unhandled exceptions are routed to /Home/Error and HSTS is enabled.
if (!app.Environment.IsDevelopment())
{
app.UseExceptionHandler("/Home/Error");
// The default HSTS value is 30 days. You may want to change this for production scenarios, see https://aka.ms/aspnetcore-hsts.
app.UseHsts();
}

// The middleware below is registered in pipeline order; keep the sequence when editing.
app.UseHttpsRedirection();
app.UseStaticFiles();

app.UseRouting();

app.UseAuthorization();

// Conventional route: controller defaults to Home, action defaults to Index, id is optional.
app.MapControllerRoute(
name: "default",
pattern: "{controller=Home}/{action=Index}/{id?}");

app.Run();
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
{
"profiles": {
"http": {
"commandName": "Project",
"launchBrowser": true,
"environmentVariables": {
"ASPNETCORE_ENVIRONMENT": "Development"
},
"dotnetRunMessages": true,
"applicationUrl": "http://localhost:5294"
},
"https": {
"commandName": "Project",
"launchBrowser": true,
"environmentVariables": {
"ASPNETCORE_ENVIRONMENT": "Development"
},
"dotnetRunMessages": true,
"applicationUrl": "https://localhost:7239;http://localhost:5294"
},
"IIS Express": {
"commandName": "IISExpress",
"launchBrowser": true,
"environmentVariables": {
"ASPNETCORE_ENVIRONMENT": "Development"
}
},
"Container (Dockerfile)": {
"commandName": "Docker",
"launchBrowser": true,
"launchUrl": "{Scheme}://{ServiceHost}:{ServicePort}",
"environmentVariables": {
"ASPNETCORE_HTTPS_PORTS": "8081",
"ASPNETCORE_HTTP_PORTS": "8080"
},
"publishAllPorts": true,
"useSSL": true
}
},
"$schema": "http://json.schemastore.org/launchsettings.json",
"iisSettings": {
"windowsAuthentication": false,
"anonymousAuthentication": true,
"iisExpress": {
"applicationUrl": "http://localhost:25397",
"sslPort": 44375
}
}
}
Loading
Loading