# type-pdf.ps1
# Displays unformatted plain text from a PDF file whose page content streams
# use zlib (FlateDecode) compression.
#
# Known limitations (unchanged from the original design):
#   * Only a classic cross-reference table is understood (no xref streams),
#     and only the last xref section of the file is read (/Prev is not followed).
#   * Dictionaries must be laid out one entry per line, with '<<' and '>>'
#     each alone on their own line. Nested dictionaries are not parsed.
#   * The page tree is assumed to be flat (/Kids of the root /Pages node are pages).

function read-pdfline {
    # Reads one line from the stream, starting at its current position.
    # A line ends at LF, CR or CRLF; the terminator is consumed but not returned.
    # Bytes are mapped 1:1 to characters (Latin-1).
    # Returns the line text (possibly empty), or nothing ($null) at end of stream.
    param ([System.IO.Stream]$rlstream)
    if ($rlstream.Position -ge $rlstream.Length) { return }
    $rlText = New-Object System.Text.StringBuilder
    while (($rlByte = $rlstream.ReadByte()) -ge 0) {
        if ($rlByte -eq 10) { break }
        if ($rlByte -eq 13) {
            # Swallow the LF of a CRLF pair; a lone CR is a terminator by itself,
            # so step back if the following byte belongs to the next line.
            $rlNext = $rlstream.ReadByte()
            if ($rlNext -ge 0 -and $rlNext -ne 10) { $rlstream.Position = $rlstream.Position - 1 }
            break
        }
        [void]$rlText.Append([char]$rlByte)
    }
    $rlText.ToString()
}

function read-pdftail {
    # Returns the text from the LAST occurrence of $rtkeyword (case-insensitive)
    # to the end of the stream, or nothing ($null) if the keyword is not found.
    # Only the final 1 MB of the stream is searched.
    param ([System.IO.Stream]$rtstream, [string]$rtkeyword)
    $rtLatin1 = [System.Text.Encoding]::GetEncoding(28591)
    [int]$rtLimit = [Math]::Min([long]1048576, $rtstream.Length)
    [int]$rtSize = [Math]::Min(1024, $rtLimit)
    while ($rtSize -gt 0) {
        $rtBytes = New-Object byte[] $rtSize
        $rtstream.Position = $rtstream.Length - $rtSize
        [int]$rtGot = 0
        while ($rtGot -lt $rtSize) {
            $rtRead = $rtstream.Read($rtBytes, $rtGot, $rtSize - $rtGot)
            if ($rtRead -le 0) { break }
            $rtGot += $rtRead
        }
        $rtText = $rtLatin1.GetString($rtBytes, 0, $rtGot)
        $rtAt = $rtText.LastIndexOf($rtkeyword, [System.StringComparison]::OrdinalIgnoreCase)
        if ($rtAt -ge 0) { return $rtText.Substring($rtAt) }
        if ($rtSize -ge $rtLimit) { break }
        # Keyword not in this window: widen it and try again.
        $rtSize = [Math]::Min($rtSize * 4, $rtLimit)
    }
}

function get-pdfobjrefs {
    # Extracts the object numbers from indirect references ("12 0 R 15 0 R")
    # in a dictionary value. Emits one [int] per reference, in order.
    param ([string]$orvalue)
    foreach ($orMatch in [regex]::Matches($orvalue, '([0-9]+)\s+[0-9]+\s+R\b')) {
        [int]$orMatch.Groups[1].Value
    }
}

function format-pdferror {
    # Emits the diagnostic text for an unexpected error, followed by the
    # error record itself.
    param ($fperror)
    "trapped " + $fperror.Exception.GetType().Name + " in script " + $fperror.InvocationInfo.ScriptName + " at line " + $fperror.InvocationInfo.ScriptLineNumber + " offset " + $fperror.InvocationInfo.OffsetInLine + ":"
    $fperror
}

function convert-pdfdict2hash {
    # Converts a << >> delimited PDF dictionary to a hashtable.
    # Called from convert-pdf2txt and show-pdfcontent.
    #
    #   $pdstream  open stream over the PDF file
    #   $pdoffset  byte offset of the object that holds the dictionary
    #
    # Returns a hashtable: key = entry name without '/', value = rest of the
    # line with '<<', '[' and ']' removed and trimmed. A key that occurs more
    # than once has its values joined with '%'.
    # Parsing begins at the first line that is exactly '<<' and ends at the
    # first line that is exactly '>>' (or at end of file).
    param ([System.IO.FileStream]$pdstream, [long]$pdoffset)
    $PDResult = @{}
    [bool]$addtohash = $false
    $pdstream.Position = $pdoffset
    while ($null -ne ($line = read-pdfline $pdstream)) {
        $line = $line.Trim()
        if ($line -eq '>>') { break }
        if ($line -eq '<<') { $addtohash = $true; continue }
        if (-not $addtohash -or $line.Length -eq 0) { continue }

        $pdKey = $line.Split(' ')[0].Replace('/', '')
        if ($pdKey.Length -eq 0) { continue }   # nothing usable as a key on this line
        $valueStart = $line.IndexOf($pdKey) + $pdKey.Length
        $pdValue = $line.Substring($valueStart).Replace('<<', '').Replace('[', '').Replace(']', '').Trim()

        if ($PDResult.ContainsKey($pdKey)) {
            $PDResult[$pdKey] = $PDResult[$pdKey] + '%' + $pdValue
        }
        else {
            $PDResult.Add($pdKey, $pdValue)
        }
    }
    $PDResult
}

function show-pdfcontent {
    # Requires function convert-pdfdict2hash. Called from convert-pdf2txt.
    #
    #   $scfilestream  open stream over the PDF file
    #   $iscoffset     byte offset of the content stream object
    #
    # If the object's /Filter is exactly /FlateDecode, inflates its stream and
    # emits the string operand of every line of the form "(text) Tx", "(text) '"
    # or "(text) "". Objects with any other filter, or none, produce no output.
    param ([System.IO.FileStream]$scfilestream, [long]$iscoffset)
    $dContentInfo = convert-pdfdict2hash $scfilestream $iscoffset
    if (-not $dContentInfo.ContainsKey('filter')) { return }
    # -ne on strings is case-insensitive, matching the original's ToLower() intent.
    if ([string]$dContentInfo['filter'] -ne '/flatedecode') { return }

    # Find the line carrying the 'stream' keyword; the data starts right after
    # that line's terminator, which is where read-pdfline leaves the position.
    $scfilestream.Position = $iscoffset
    $foundStream = $false
    while ($null -ne ($line = read-pdfline $scfilestream)) {
        if ($line.ToLower().Contains('stream')) { $foundStream = $true; break }
    }
    if (-not $foundStream) { return }

    # DeflateStream expects raw deflate data, so skip the 2-byte zlib header (RFC 1950).
    $scfilestream.Position = $scfilestream.Position + 2

    # leaveOpen = $true so closing the inflater does not close the PDF file itself.
    $inflater = New-Object System.IO.Compression.DeflateStream -ArgumentList $scfilestream, ([System.IO.Compression.CompressionMode]::Decompress), $true
    $sdecomp = New-Object System.Text.StringBuilder
    try {
        $latin1 = [System.Text.Encoding]::GetEncoding(28591)
        $scbuffer = New-Object byte[] 4096
        # Read until the inflater reports end of data, appending only the bytes
        # actually returned by each call.
        while (($iChars = $inflater.Read($scbuffer, 0, $scbuffer.Length)) -gt 0) {
            [void]$sdecomp.Append($latin1.GetString($scbuffer, 0, $iChars))
        }
    }
    finally {
        $inflater.Close()
    }

    foreach ($sLine in [regex]::Split($sdecomp.ToString(), '\r\n|\r|\n')) {
        if ($sLine -match '^\((.*)\) (T.|\''|\")$') {
            $matches[1]
        }
    }
}

function convert-pdf2txt {
    # Requires functions: convert-pdfdict2hash, show-pdfcontent.
    #
    #   $pdfsource  path of the PDF file; '.pdf' is appended if missing
    #   $txtdest    accepted for compatibility; the extracted text is written
    #               to the output pipeline and this value is not used
    #
    # Emits the extracted text lines, or a diagnostic message on failure.
    param ([string]$pdfsource, [string]$txtdest)
    if ($pdfsource -notlike '*.pdf') { $pdfsource = $pdfsource + '.pdf' }
    if (-not (Test-Path -LiteralPath $pdfsource)) {
        "Couldn't find file $($pdfsource)"
        return
    }
    # Resolve against the PowerShell location, not the process working directory.
    $pdfsource = (Resolve-Path -LiteralPath $pdfsource).ProviderPath

    $pdfstream = $null
    try {
        $pdfstream = [IO.File]::OpenRead($pdfsource)

        # Locate the cross-reference table via the trailing 'startxref' entry.
        [long]$xrefoffset = 0
        $tail = read-pdftail $pdfstream 'startxref'
        if ($tail -match '^startxref\s+([0-9]+)') { $xrefoffset = [long]$matches[1] }
        if (-not $xrefoffset) { "Couldn't get xref offset. Quitting"; return }

        # Build object number -> byte offset from the xref table (in-use entries only).
        $dxref = @{}
        $pdfstream.Position = $xrefoffset
        $line = read-pdfline $pdfstream
        if ($null -ne $line -and $line.Trim() -eq 'xref') {
            [int]$nextObj = 0
            while ($null -ne ($line = read-pdfline $pdfstream)) {
                $line = $line.Trim()
                if ($line -like 'trailer*') { break }
                if ($line -match '^([0-9]+) +([0-9]+)$') {
                    # Subsection header: "<first object number> <entry count>".
                    $nextObj = [int]$matches[1]
                }
                elseif ($line -match '^([0-9]{10}) +([0-9]{5}) +([nf])$') {
                    if ($matches[3] -eq 'n') { $dxref[$nextObj] = [long]$matches[1] }
                    $nextObj++
                }
            }
        }

        # Get the root (catalog) object number from the trailer.
        $rootOffset = $null
        $trailer = read-pdftail $pdfstream 'trailer'
        if ($trailer -match '/Root\s+([0-9]+)') { $rootOffset = $dxref[[int]$matches[1]] }
        if (-not $rootOffset) {
            # No usable root object / page structure could be located.
            "Document may be encrypted!"
            return
        }

        $dRoot = convert-pdfdict2hash $pdfstream $rootOffset
        $pagesRefs = @(get-pdfobjrefs $dRoot['Pages'])
        if ($pagesRefs.Count -eq 0) {
            Write-Warning "No /Pages reference found in the document catalog."
            return
        }
        $iPagesOffset = $dxref[$pagesRefs[0]]
        if (-not $iPagesOffset) {
            Write-Warning "The /Pages object is not listed in the xref table."
            return
        }

        $dPages = convert-pdfdict2hash $pdfstream $iPagesOffset
        foreach ($iPageObjectNum in @(get-pdfobjrefs $dPages['Kids'])) {
            $iPageOffset = $dxref[$iPageObjectNum]
            if (-not $iPageOffset) { continue }
            $dPage = convert-pdfdict2hash $pdfstream $iPageOffset
            foreach ($iContentObjectNum in @(get-pdfobjrefs $dPage['Contents'])) {
                $iContentOffset = $dxref[$iContentObjectNum]
                if (-not $iContentOffset) { continue }
                # A broken stream is reported but must not stop the remaining pages.
                try { show-pdfcontent $pdfstream $iContentOffset }
                catch { format-pdferror $_ }
            }
        }
    }
    catch {
        # Exceptions from .NET method calls arrive wrapped, so check the inner one too.
        $failure = $_.Exception
        if ($failure -is [System.IO.FileNotFoundException] -or $failure.InnerException -is [System.IO.FileNotFoundException]) {
            "File not found exception"
            "Try copying the file to a local drive."
        }
        else {
            format-pdferror $_
        }
    }
    finally {
        if ($pdfstream) { $pdfstream.Close() }
    }
}

$usage="type-pdf.ps1 `nWill display unformatted plain text from pdf file if the file uses zlib compression."
# Exactly one argument (the PDF path, without wildcards) is accepted.
if ($args.length -ne 1) {
    $usage
}
elseif (([string]$args[0]).Contains('?') -or ([string]$args[0]).Contains('*')) {
    $usage
}
else {
    convert-pdf2txt $args[0]
}