diff --git a/Example/Example04.ExtractText/Example04.ps1 b/Example/Example04.ExtractText/Example04.ps1 index 8b2a94f..00e8c39 100644 --- a/Example/Example04.ExtractText/Example04.ps1 +++ b/Example/Example04.ExtractText/Example04.ps1 @@ -1,11 +1,16 @@ -Import-Module .\PSWritePDF.psd1 -Force +Import-Module .\PSWritePDF.psd1 -Force -# Get all pages text -Convert-PDFToText -FilePath "$PSScriptRoot\Example04.pdf" +# Get all pages text as objects +$pages = Convert-PDFToText -FilePath "$PSScriptRoot\Example04.pdf" +$pages | Format-Table -AutoSize -Convert-PDFToText -FilePath "$PSScriptRoot\Example04.pdf" -ExtractionStrategy LocationTextExtractionStrategy +# Save combined text to file +Convert-PDFToText -FilePath "$PSScriptRoot\Example04.pdf" -OutFile "$PSScriptRoot\Example04.txt" +# Use different extraction strategies +Convert-PDFToText -FilePath "$PSScriptRoot\Example04.pdf" -ExtractionStrategy LocationTextExtractionStrategy Convert-PDFToText -FilePath "$PSScriptRoot\Example04.pdf" -ExtractionStrategy SimpleTextExtractionStrategy # Get page 1 text only -Convert-PDFToText -FilePath "$PSScriptRoot\Example04.pdf" -Page 1 -IgnoreProtection \ No newline at end of file +Convert-PDFToText -FilePath "$PSScriptRoot\Example04.pdf" -Page 1 -IgnoreProtection + diff --git a/Sources/PSWritePDF/Cmdlets/CmdletConvertPDFToText.cs b/Sources/PSWritePDF/Cmdlets/CmdletConvertPDFToText.cs index f22b029..78b2b21 100644 --- a/Sources/PSWritePDF/Cmdlets/CmdletConvertPDFToText.cs +++ b/Sources/PSWritePDF/Cmdlets/CmdletConvertPDFToText.cs @@ -3,13 +3,14 @@ using System.IO; using System.Linq; using System.Management.Automation; +using System.Text; using iText.Kernel.Pdf; using iText.Kernel.Pdf.Canvas.Parser; namespace PSWritePDF.Cmdlets; [Cmdlet(VerbsData.Convert, "PDFToText")] -[OutputType(typeof(string))] +[OutputType(typeof(PSObject))] public class CmdletConvertPDFToText : PSCmdlet { [Parameter(Mandatory = true, ValueFromPipeline = true, ValueFromPipelineByPropertyName = true)] @@ -25,6 +26,9 @@ 
public class CmdletConvertPDFToText : PSCmdlet [Parameter] public SwitchParameter IgnoreProtection { get; set; } + [Parameter] + public string? OutFile { get; set; } + protected override void ProcessRecord() { if (!File.Exists(FilePath)) @@ -43,6 +47,8 @@ protected override void ProcessRecord() int pagesCount = pdf.GetNumberOfPages(); IEnumerable<int> pages = Page.Length == 0 ? Enumerable.Range(1, pagesCount) : Page; + var collectedTexts = new List<string>(); + foreach (int pageNum in pages) { if (pageNum < 1 || pageNum > pagesCount) @@ -56,13 +62,32 @@ protected override void ProcessRecord() var page = pdf.GetPage(pageNum); var strategy = ExtractionStrategy.ToStrategy(); string text = PdfTextExtractor.GetTextFromPage(page, strategy); - WriteObject(text); + + var outputObject = new PSObject(); + outputObject.TypeNames.Insert(0, "System.Management.Automation.PSCustomObject"); + outputObject.Properties.Add(new PSNoteProperty("PageNumber", pageNum)); + outputObject.Properties.Add(new PSNoteProperty("Text", text)); + WriteObject(outputObject); + collectedTexts.Add(text); } catch (Exception ex) { WriteWarning($"Processing document '{FilePath}' failed with error: {ex.Message}"); } } + + if (!string.IsNullOrWhiteSpace(OutFile)) + { + try + { + var combined = string.Join(Environment.NewLine, collectedTexts); + File.WriteAllText(OutFile, combined, Encoding.UTF8); + } + catch (Exception ex) + { + WriteWarning($"Saving file '{OutFile}' failed with error: {ex.Message}"); + } + } } } diff --git a/Tests/Convert-PDFToText.Tests.ps1 b/Tests/Convert-PDFToText.Tests.ps1 index e93aeff..dc8b293 100644 --- a/Tests/Convert-PDFToText.Tests.ps1 +++ b/Tests/Convert-PDFToText.Tests.ps1 @@ -1,12 +1,22 @@ Describe 'Convert-PDFToText' { - It 'extracts text from PDF' { + It 'returns objects with PageNumber and Text' { $file = Join-Path $PSScriptRoot 'Input' 'SampleAcroForm.pdf' $text = Convert-PDFToText -FilePath $file - ($text -join "`n") | Should -Match 'Text 1' + $text | Should -AllBeOfType 
[pscustomobject] + $text[0].PageNumber | Should -Be 1 + $text[0].Text | Should -Match 'Text 1' } It 'accepts piped Get-ChildItem results' { $files = Get-ChildItem -Path (Join-Path $PSScriptRoot 'Input') -Filter '*.pdf' $text = $files | Convert-PDFToText - ($text -join "`n") | Should -Match 'Text 1' + ($text | Select-Object -ExpandProperty Text | Out-String) | Should -Match 'Text 1' + } + It 'creates output file with combined text' { + $file = Join-Path $PSScriptRoot 'Input' 'SampleAcroForm.pdf' + $outFile = Join-Path $TestDrive 'output.txt' + $result = Convert-PDFToText -FilePath $file -OutFile $outFile + Test-Path $outFile | Should -BeTrue + ($result | Select-Object -ExpandProperty Text | Out-String) | Should -Match 'Text 1' + (Get-Content $outFile -Raw) | Should -Match 'Text 1' } }