Word: Extract Embedded Office files, PDFs, and ZIPs
Sometimes you get a lot of Word/Excel/Powerpoint docs with embedded DOCX, XLSX, VSDX, PPTX, PDFs, and ZIP files which you need to extract.
Here is a VBA macro that combs through those Office Docs and extracts them.
It also renames them something sensible (based on the file name of the originating doc).
To use it:
1. Copy all the Office Docs (must be docx or pptx or xlsx) to a single folder.
2. Unzip all the docx/pptx/xlsx files (these are just zip files which have been renamed) to this same folder.
3. Run the macro "start_extract_PDFs_from_folders"
Sub start_extract_PDFs_from_folders()
' Extracts embedded PDFs from multiple folders within a folder.
' 2022-04-28 Edward Chan
' To use, unzip all the Word Files in a single folder, then run this macro.
' Folder traverse from https://stackoverflow.com/questions/22645347/loop-through-all-subfolders-using-vba
' PDF Extract from https://stackoverflow.com/questions/64157886/how-to-extract-embedded-pdf-from-word-document-in-linux-mac/64157887?utm_source=pocket_mylist
Dim FileSystem As Object
Dim HostFolder As String
Dim dlgOpen As FileDialog
Set dlgOpen = Application.FileDialog( _
FileDialogType:=msoFileDialogFolderPicker)
With dlgOpen
.AllowMultiSelect = True
.Title = "Select the folder containing all the unzipped docx folders to extract PDF file(s) from"
.InitialFileName = "*.docx"
.Show
End With
HostFolder = dlgOpen.SelectedItems.Item(1)
Set FileSystem = CreateObject("Scripting.FileSystemObject")
Call DoFolder(FileSystem.GetFolder(HostFolder), "word")
Call DoFolder(FileSystem.GetFolder(HostFolder), "xl")
Call DoFolder(FileSystem.GetFolder(HostFolder), "visio")
Call DoFolder(FileSystem.GetFolder(HostFolder), "ppt")
End Sub
Sub DoFolder(Folder, app As String)
Dim SubFolder
For Each SubFolder In Folder.SubFolders
'DoFolder SubFolder ' Uncomment this for recursive
Debug.Print SubFolder
Dim Contents As String
Dim PDF As String
Dim ZIP As String
Dim hFile As Integer
Dim i As Long, j As Long
Dim ExtractedZippedDocxFolder, FileNameBin, FileNamePDF, BinFolderPath, FileNameNew, FolderUp As String
Dim fileIndex As Integer
' Dim dlgOpen As FileDialog
' Set dlgOpen = Application.FileDialog( _
' FileDialogType:=msoFileDialogFolderPicker)
' With dlgOpen
' .AllowMultiSelect = True
' .Title = "Select the unzipped docx folder to extract PDF file(s) from"
' .InitialFileName = "*.docx"
' .Show
' End With
' ExtractedZippedDocxFolder = dlgOpen.SelectedItems.Item(1)
ExtractedZippedDocxFolder = SubFolder
BinFolderPath = ExtractedZippedDocxFolder + "\" & app & "\embeddings"
Set objFSO = CreateObject("Scripting.FileSystemObject")
If objFSO.FolderExists(BinFolderPath) = True Then ' There are embedded objects
Set objFolder = objFSO.GetFolder(BinFolderPath)
fileIndex = 0
For Each objFile In objFolder.Files
If LCase$(Right$(objFile.Name, 4)) = ".bin" Then
FileNameIndex = Left$(objFile.Name, Len(objFile.Name) - Len(".bin"))
FileNameBin = BinFolderPath + "\" + FileNameIndex + ".bin"
FileNamePDF = BinFolderPath + "\" + FileNameIndex + ".pdf"
hFile = FreeFile
Open FileNameBin For Binary Access Read As #hFile
Contents = String(LOF(hFile), vbNullChar)
Get #hFile, , Contents
Close #hFile
i = InStrB(1, Contents, "%PDF")
If i <> 0 Then ' It is a PDF
j = InStrB(i, Contents, "%%EOF")
If (InStrB(j + 1, Contents, "%%EOF") > 0) Then j = InStrB(j + 1, Contents, "%%EOF")
PDF = MidB(Contents, i, j + 5 - i + 12)
Open FileNamePDF For Binary Access Write As #hFile
Put #hFile, , PDF
Close #hFile
FolderUp = Left(ExtractedZippedDocxFolder, InStrRev(ExtractedZippedDocxFolder, "\"))
FileNameNew = FolderUp + Right(ExtractedZippedDocxFolder, Len(ExtractedZippedDocxFolder) - InStrRev(ExtractedZippedDocxFolder, "\")) + "." + FileNameIndex + ".pdf"
Name FileNamePDF As FileNameNew
fileIndex = fileIndex + 1
Else
' Zip files
i = InStrB(1, Contents, "PK" & Chr(3) & Chr(4))
If i <> 0 Then ' It is a zip file. Just find the start and go to the end. There will be a little bit of garbage at the end but it'll be ignored by zip programs. It's too complicated to figure out where a ZIP file ends!!
ZIP = MidB(Contents, i, Len(Contents))
FileNameZIP = BinFolderPath + "\" + FileNameIndex + ".zip"
Open FileNameZIP For Binary Access Write As #hFile
Put #hFile, , ZIP
Close #hFile
FolderUp = Left(ExtractedZippedDocxFolder, InStrRev(ExtractedZippedDocxFolder, "\"))
FileNameNew = FolderUp + Right(ExtractedZippedDocxFolder, Len(ExtractedZippedDocxFolder) - InStrRev(ExtractedZippedDocxFolder, "\")) + "." + FileNameIndex + ".zip"
Name FileNameZIP As FileNameNew
fileIndex = fileIndex + 1
End If
End If
Else
' Check for docx files
If LCase$(Right$(objFile.Name, 5)) = ".docx" Then
FolderUp = Left(ExtractedZippedDocxFolder, InStrRev(ExtractedZippedDocxFolder, "\"))
FileNameNew = FolderUp + Right(ExtractedZippedDocxFolder, Len(ExtractedZippedDocxFolder) - InStrRev(ExtractedZippedDocxFolder, "\")) + "." + FileNameIndex + ".docx"
FileCopy objFile.Path, FileNameNew
fileIndex = fileIndex + 1
End If
' Check for xlsx files
If LCase$(Right$(objFile.Name, 5)) = ".xlsx" Then
FolderUp = Left(ExtractedZippedDocxFolder, InStrRev(ExtractedZippedDocxFolder, "\"))
FileNameNew = FolderUp + Right(ExtractedZippedDocxFolder, Len(ExtractedZippedDocxFolder) - InStrRev(ExtractedZippedDocxFolder, "\")) + "." + FileNameIndex + ".xlsx"
FileCopy objFile.Path, FileNameNew
fileIndex = fileIndex + 1
End If
' Check for pptx files
If LCase$(Right$(objFile.Name, 5)) = ".pptx" Then
FolderUp = Left(ExtractedZippedDocxFolder, InStrRev(ExtractedZippedDocxFolder, "\"))
FileNameNew = FolderUp + Right(ExtractedZippedDocxFolder, Len(ExtractedZippedDocxFolder) - InStrRev(ExtractedZippedDocxFolder, "\")) + "." + FileNameIndex + ".pptx"
FileCopy objFile.Path, FileNameNew
fileIndex = fileIndex + 1
End If
' Check for vsdx files
If LCase$(Right$(objFile.Name, 5)) = ".vsdx" Then
FolderUp = Left(ExtractedZippedDocxFolder, InStrRev(ExtractedZippedDocxFolder, "\"))
FileNameNew = FolderUp + Right(ExtractedZippedDocxFolder, Len(ExtractedZippedDocxFolder) - InStrRev(ExtractedZippedDocxFolder, "\")) + "." + FileNameIndex + ".vsdx"
FileCopy objFile.Path, FileNameNew
fileIndex = fileIndex + 1
End If
End If
Next
If fileIndex = 0 Then
Debug.Print "Unable to find any bin file in the givven unzipped docx file content"
Else
Debug.Print Str(fileIndex) + " files were processed"
End If
End If
Next
' Dim File
' For Each File In Folder.Files
' Next
End Sub
Sub export_PDFs()
' This is for a single file only
Dim Contents As String
Dim PDF As String
Dim hFile As Integer
Dim i As Long, j As Long
Dim ExtractedZippedDocxFolder, FileNameBin, FileNamePDF, BinFolderPath As String
Dim fileIndex As Integer
Dim dlgOpen As FileDialog
Set dlgOpen = Application.FileDialog( _
FileDialogType:=msoFileDialogFolderPicker)
With dlgOpen
.AllowMultiSelect = False
.Title = "Select the unzipped docx folder to extract PDF file(s) from"
.InitialFileName = "*.docx"
.Show
End With
ExtractedZippedDocxFolder = dlgOpen.SelectedItems.Item(1)
BinFolderPath = ExtractedZippedDocxFolder + "\word\embeddings"
Set objFSO = CreateObject("Scripting.FileSystemObject")
Set objFolder = objFSO.GetFolder(BinFolderPath)
fileIndex = 0
For Each objFile In objFolder.Files
If LCase$(Right$(objFile.Name, 4)) = ".bin" Then
FileNameIndex = Left$(objFile.Name, Len(objFile.Name) - Len(".bin"))
FileNameBin = BinFolderPath + "\" + FileNameIndex + ".bin"
FileNamePDF = BinFolderPath + "\" + FileNameIndex + ".pdf"
hFile = FreeFile
Open FileNameBin For Binary Access Read As #hFile
Contents = String(LOF(hFile), vbNullChar)
Get #hFile, , Contents
Close #hFile
i = InStrB(1, Contents, "%PDF")
If i <> 0 Then ' It is a PDF
j = InStrB(i, Contents, "%%EOF")
If (InStrB(j + 1, Contents, "%%EOF") > 0) Then j = InStrB(j + 1, Contents, "%%EOF")
PDF = MidB(Contents, i, j + 5 - i + 12)
Open FileNamePDF For Binary Access Write As #hFile
Put #hFile, , PDF
Close #hFile
fileIndex = fileIndex + 1
End If
End If
Next
If fileIndex = 0 Then
MsgBox "Unable to find any bin file in the givven unzipped docx file content"
Else
MsgBox Str(fileIndex) + " files were processed"
End If
End Sub