Word: Extract Embedded Office files, PDFs, and ZIPs
Office 365 files
Sometimes you get a lot of Word/Excel/Powerpoint docs with embedded DOCX, XLSX, VSDX, PPTX, PDFs, and ZIP files which you need to extract.
Here is a VBA macro that combs through those Office Docs and extracts them.
It also renames them something sensible (based on the file name of the originating doc).
To use it:
1. Copy all the Office Docs (must be docx or pptx or xlsx) to a single folder.
2. Unzip all the docx/pptx/xlsx files (these are just zip files which have been renamed) to this same folder.
3. Run the macro "start_extract_PDFs_from_folders"
If you have an old VSD that's embedded, you can find it in the embedded folder. LibreOffice does a fair job of opening old VSDs, but to convert it to VSDX you'll have to save as SVG, then open the SVG inside Visio.
TODO see if this also works: https://www.datanumen.com/blogs/2-quick-ways-extract-ms-office-files-embedded-word-document/
Sub start_extract_PDFs_from_folders()
' Extracts embedded PDFs from multiple folders within a folder.
' 2022-04-28 Edward Chan
' To use, unzip all the Word Files in a single folder, then run this macro.
' Folder traverse from https://stackoverflow.com/questions/22645347/loop-through-all-subfolders-using-vba
' PDF Extract from https://stackoverflow.com/questions/64157886/how-to-extract-embedded-pdf-from-word-document-in-linux-mac/64157887?utm_source=pocket_mylist
Dim FileSystem As Object
Dim HostFolder As String
Dim dlgOpen As FileDialog
Set dlgOpen = Application.FileDialog( _
FileDialogType:=msoFileDialogFolderPicker)
With dlgOpen
.AllowMultiSelect = True
.Title = "Select the folder containing all the unzipped docx folders to extract PDF file(s) from"
.InitialFileName = "*.docx"
.Show
End With
HostFolder = dlgOpen.SelectedItems.Item(1)
Set FileSystem = CreateObject("Scripting.FileSystemObject")
Call DoFolder(FileSystem.GetFolder(HostFolder), "word")
Call DoFolder(FileSystem.GetFolder(HostFolder), "xl")
Call DoFolder(FileSystem.GetFolder(HostFolder), "visio")
Call DoFolder(FileSystem.GetFolder(HostFolder), "ppt")
End Sub
Sub DoFolder(Folder, app As String)
Dim SubFolder
For Each SubFolder In Folder.SubFolders
'DoFolder SubFolder ' Uncomment this for recursive
Debug.Print SubFolder
Dim Contents As String
Dim PDF As String
Dim ZIP As String
Dim hFile As Integer
Dim i As Long, j As Long
Dim ExtractedZippedDocxFolder, FileNameBin, FileNamePDF, BinFolderPath, FileNameNew, FolderUp As String
Dim fileIndex As Integer
' Dim dlgOpen As FileDialog
' Set dlgOpen = Application.FileDialog( _
' FileDialogType:=msoFileDialogFolderPicker)
' With dlgOpen
' .AllowMultiSelect = True
' .Title = "Select the unzipped docx folder to extract PDF file(s) from"
' .InitialFileName = "*.docx"
' .Show
' End With
' ExtractedZippedDocxFolder = dlgOpen.SelectedItems.Item(1)
ExtractedZippedDocxFolder = SubFolder
BinFolderPath = ExtractedZippedDocxFolder + "\" & app & "\embeddings"
Set objFSO = CreateObject("Scripting.FileSystemObject")
If objFSO.FolderExists(BinFolderPath) = True Then ' There are embedded objects
Set objFolder = objFSO.GetFolder(BinFolderPath)
fileIndex = 0
For Each objFile In objFolder.Files
If LCase$(Right$(objFile.Name, 4)) = ".bin" Then
FileNameIndex = Left$(objFile.Name, Len(objFile.Name) - Len(".bin"))
FileNameBin = BinFolderPath + "\" + FileNameIndex + ".bin"
FileNamePDF = BinFolderPath + "\" + FileNameIndex + ".pdf"
hFile = FreeFile
Open FileNameBin For Binary Access Read As #hFile
Contents = String(LOF(hFile), vbNullChar)
Get #hFile, , Contents
Close #hFile
i = InStrB(1, Contents, "%PDF")
If i <> 0 Then ' It is a PDF
j = InStrB(i, Contents, "%%EOF")
If (InStrB(j + 1, Contents, "%%EOF") > 0) Then j = InStrB(j + 1, Contents, "%%EOF")
PDF = MidB(Contents, i, j + 5 - i + 12)
Open FileNamePDF For Binary Access Write As #hFile
Put #hFile, , PDF
Close #hFile
FolderUp = Left(ExtractedZippedDocxFolder, InStrRev(ExtractedZippedDocxFolder, "\"))
FileNameNew = FolderUp + Right(ExtractedZippedDocxFolder, Len(ExtractedZippedDocxFolder) - InStrRev(ExtractedZippedDocxFolder, "\")) + "." + FileNameIndex + ".pdf"
Name FileNamePDF As FileNameNew
fileIndex = fileIndex + 1
Else
' Zip files
i = InStrB(1, Contents, "PK" & Chr(3) & Chr(4))
If i <> 0 Then ' It is a zip file. Just find the start and go to the end. There will be a little bit of garbage at the end but it'll be ignored by zip programs. It's too complicated to figure out where a ZIP file ends!!
ZIP = MidB(Contents, i, Len(Contents))
FileNameZIP = BinFolderPath + "\" + FileNameIndex + ".zip"
Open FileNameZIP For Binary Access Write As #hFile
Put #hFile, , ZIP
Close #hFile
FolderUp = Left(ExtractedZippedDocxFolder, InStrRev(ExtractedZippedDocxFolder, "\"))
FileNameNew = FolderUp + Right(ExtractedZippedDocxFolder, Len(ExtractedZippedDocxFolder) - InStrRev(ExtractedZippedDocxFolder, "\")) + "." + FileNameIndex + ".zip"
Name FileNameZIP As FileNameNew
fileIndex = fileIndex + 1
End If
End If
Else
' Check for docx files
If LCase$(Right$(objFile.Name, 5)) = ".docx" Then
FolderUp = Left(ExtractedZippedDocxFolder, InStrRev(ExtractedZippedDocxFolder, "\"))
FileNameNew = FolderUp + Right(ExtractedZippedDocxFolder, Len(ExtractedZippedDocxFolder) - InStrRev(ExtractedZippedDocxFolder, "\")) + "." + FileNameIndex + ".docx"
FileCopy objFile.Path, FileNameNew
fileIndex = fileIndex + 1
End If
' Check for xlsx files
If LCase$(Right$(objFile.Name, 5)) = ".xlsx" Then
FolderUp = Left(ExtractedZippedDocxFolder, InStrRev(ExtractedZippedDocxFolder, "\"))
FileNameNew = FolderUp + Right(ExtractedZippedDocxFolder, Len(ExtractedZippedDocxFolder) - InStrRev(ExtractedZippedDocxFolder, "\")) + "." + FileNameIndex + ".xlsx"
FileCopy objFile.Path, FileNameNew
fileIndex = fileIndex + 1
End If
' Check for pptx files
If LCase$(Right$(objFile.Name, 5)) = ".pptx" Then
FolderUp = Left(ExtractedZippedDocxFolder, InStrRev(ExtractedZippedDocxFolder, "\"))
FileNameNew = FolderUp + Right(ExtractedZippedDocxFolder, Len(ExtractedZippedDocxFolder) - InStrRev(ExtractedZippedDocxFolder, "\")) + "." + FileNameIndex + ".pptx"
FileCopy objFile.Path, FileNameNew
fileIndex = fileIndex + 1
End If
' Check for vsdx files
If LCase$(Right$(objFile.Name, 5)) = ".vsdx" Then
FolderUp = Left(ExtractedZippedDocxFolder, InStrRev(ExtractedZippedDocxFolder, "\"))
FileNameNew = FolderUp + Right(ExtractedZippedDocxFolder, Len(ExtractedZippedDocxFolder) - InStrRev(ExtractedZippedDocxFolder, "\")) + "." + FileNameIndex + ".vsdx"
FileCopy objFile.Path, FileNameNew
fileIndex = fileIndex + 1
End If
End If
Next
If fileIndex = 0 Then
Debug.Print "Unable to find any bin file in the givven unzipped docx file content"
Else
Debug.Print Str(fileIndex) + " files were processed"
End If
End If
Next
' Dim File
' For Each File In Folder.Files
' Next
End Sub
Sub export_PDFs()
' This is for a single file only
Dim Contents As String
Dim PDF As String
Dim hFile As Integer
Dim i As Long, j As Long
Dim ExtractedZippedDocxFolder, FileNameBin, FileNamePDF, BinFolderPath As String
Dim fileIndex As Integer
Dim dlgOpen As FileDialog
Set dlgOpen = Application.FileDialog( _
FileDialogType:=msoFileDialogFolderPicker)
With dlgOpen
.AllowMultiSelect = False
.Title = "Select the unzipped docx folder to extract PDF file(s) from"
.InitialFileName = "*.docx"
.Show
End With
ExtractedZippedDocxFolder = dlgOpen.SelectedItems.Item(1)
BinFolderPath = ExtractedZippedDocxFolder + "\word\embeddings"
Set objFSO = CreateObject("Scripting.FileSystemObject")
Set objFolder = objFSO.GetFolder(BinFolderPath)
fileIndex = 0
For Each objFile In objFolder.Files
If LCase$(Right$(objFile.Name, 4)) = ".bin" Then
FileNameIndex = Left$(objFile.Name, Len(objFile.Name) - Len(".bin"))
FileNameBin = BinFolderPath + "\" + FileNameIndex + ".bin"
FileNamePDF = BinFolderPath + "\" + FileNameIndex + ".pdf"
hFile = FreeFile
Open FileNameBin For Binary Access Read As #hFile
Contents = String(LOF(hFile), vbNullChar)
Get #hFile, , Contents
Close #hFile
i = InStrB(1, Contents, "%PDF")
If i <> 0 Then ' It is a PDF
j = InStrB(i, Contents, "%%EOF")
If (InStrB(j + 1, Contents, "%%EOF") > 0) Then j = InStrB(j + 1, Contents, "%%EOF")
PDF = MidB(Contents, i, j + 5 - i + 12)
Open FileNamePDF For Binary Access Write As #hFile
Put #hFile, , PDF
Close #hFile
fileIndex = fileIndex + 1
End If
End If
Next
If fileIndex = 0 Then
MsgBox "Unable to find any bin file in the givven unzipped docx file content"
Else
MsgBox Str(fileIndex) + " files were processed"
End If
End Sub
Other Embedded file formats
For STP files (not sure other files), you can just:
Right-click > Copy
Open Explorer folder > Paste
And magically the STP file will appear!
Not sure if this works for other file formats (doesn't seem to work with PDFs).
Below is a macro that does this automatically. Just run the sub ProcessWordDocumentsOle and choose the folder with docx files, no unzipping needed.
Sub CopyOutOLEFiles(doc As Document)
' This is for a single file only
Dim ole As OLEFormat
Dim obj As Object
Dim i As Integer
Dim shello As Object
Dim outputFolder As String
outputFolder = doc.Path
' Check each object in the active document
For Each obj In doc.InlineShapes
' Check if the object is an OLE object
If obj.Type = wdInlineShapeEmbeddedOLEObject Or obj.Type = wdInlineShapeLinkedOLEObject Then
Set ole = obj.OLEFormat
' Try to print the source name (filename)
On Error Resume Next ' Skip if error occurs
If Right(ole.IconLabel, 3) = "pdf" Or Right(ole.IconLabel, 4) = "docx" Or Right(ole.IconLabel, 4) = "xlsx" Or Right(ole.IconLabel, 4) = "vsdx" Then
' Do nothing
Else
Debug.Print ole.IconLabel
obj.Select
Selection.Copy
CreateObject("Shell.Application").Namespace(outputFolder & "\").Self.InvokeVerb "Paste"
oldFilePath = outputFolder & "\" & ole.IconLabel
newFilePath = outputFolder & "\" & doc.Name & "_" & ole.IconLabel
Name oldFilePath As newFilePath ' Sometimes it can't rename it because it is given a different filename than the IconLabel!!
On Error GoTo 0 ' Stop ignoring errors
End If
DoEvents
End If
i = i + 1
Next obj
End Sub
Sub ProcessWordDocumentsOle()
Dim fd As FileDialog
Dim selectedFolder As String
Dim wordFile As String
Dim doc As Document
Dim FileSystem As Object
' Create a FileDialog object as a Folder Picker dialog box.
Set fd = Application.FileDialog(msoFileDialogFolderPicker)
If fd.Show = -1 Then
selectedFolder = fd.SelectedItems(1) ' Get the selected folder path
' Make sure the folder path ends with a backslash
If Right(selectedFolder, 1) <> "\" Then selectedFolder = selectedFolder & "\"
' Set the path of the first Word document (.doc or .docx)
wordFile = Dir(selectedFolder & "*.doc*")
While wordFile <> ""
' Open the Word document
Set doc = Documents.Open(FileName:=selectedFolder & wordFile)
' Call the sample function to process the document
Call CopyOutOLEFiles(doc)
' Save and close the document
doc.Close SaveChanges:=wdDoNotSaveChanges
' Get the next Word document
wordFile = Dir
Wend
MsgBox "All Word documents in the selected folder have been processed."
Else
MsgBox "No folder was selected."
End If
' Clean up
Set fd = Nothing
End Sub