Word: Extract Embedded Office files, PDFs, and ZIPs

Office 365 files

Sometimes you get a lot of Word/Excel/Powerpoint docs with embedded DOCX, XLSX, VSDX, PPTX, PDFs, and ZIP files which you need to extract.

Here is a VBA macro that combs through those Office Docs and extracts them.

It also renames them something sensible (based on the file name of the originating doc).

To use it:

1. Copy all the Office Docs (must be docx or pptx or xlsx) to a single folder.

2. Unzip all the docx/pptx/xlsx files (these are just zip files which have been renamed) to this same folder.

3. Run the macro "start_extract_PDFs_from_folders"

If you have an old VSD that's embedded, you can find it in the embedded folder. LibreOffice does a fair job of opening old VSDs, but to convert it to VSDX you'll have to save as SVG, then open the SVG inside Visio.


TODO see if this also works: https://www.datanumen.com/blogs/2-quick-ways-extract-ms-office-files-embedded-word-document/


Sub start_extract_PDFs_from_folders()


' Extracts embedded PDFs from multiple folders within a folder.

' 2022-04-28 Edward Chan

' To use, unzip all the Word Files in a single folder, then run this macro.



' Folder traverse from https://stackoverflow.com/questions/22645347/loop-through-all-subfolders-using-vba

' PDF Extract from https://stackoverflow.com/questions/64157886/how-to-extract-embedded-pdf-from-word-document-in-linux-mac/64157887?utm_source=pocket_mylist



    

    Dim FileSystem As Object

    Dim HostFolder As String

    

    

    Dim dlgOpen As FileDialog

    Set dlgOpen = Application.FileDialog( _

    FileDialogType:=msoFileDialogFolderPicker)

    With dlgOpen

        .AllowMultiSelect = True

        .Title = "Select the folder containing all the unzipped docx folders to extract PDF file(s) from"

        .InitialFileName = "*.docx"

        .Show

    End With


    HostFolder = dlgOpen.SelectedItems.Item(1)


    Set FileSystem = CreateObject("Scripting.FileSystemObject")

    Call DoFolder(FileSystem.GetFolder(HostFolder), "word")

    Call DoFolder(FileSystem.GetFolder(HostFolder), "xl")

    Call DoFolder(FileSystem.GetFolder(HostFolder), "visio")

    Call DoFolder(FileSystem.GetFolder(HostFolder), "ppt")

End Sub


Sub DoFolder(Folder, app As String)

    Dim SubFolder

    For Each SubFolder In Folder.SubFolders

        'DoFolder SubFolder ' Uncomment this for recursive

        Debug.Print SubFolder


        Dim Contents As String

        Dim PDF As String

        Dim ZIP As String

        Dim hFile As Integer

        Dim i As Long, j As Long

        Dim ExtractedZippedDocxFolder, FileNameBin, FileNamePDF, BinFolderPath, FileNameNew, FolderUp As String

        Dim fileIndex As Integer

       

    '    Dim dlgOpen As FileDialog

    '    Set dlgOpen = Application.FileDialog( _

    '    FileDialogType:=msoFileDialogFolderPicker)

    '    With dlgOpen

    '        .AllowMultiSelect = True

    '        .Title = "Select the unzipped docx folder to extract PDF file(s) from"

    '        .InitialFileName = "*.docx"

    '        .Show

    '    End With

    '    ExtractedZippedDocxFolder = dlgOpen.SelectedItems.Item(1)

        

        ExtractedZippedDocxFolder = SubFolder

        

        BinFolderPath = ExtractedZippedDocxFolder + "\" & app & "\embeddings"

        Set objFSO = CreateObject("Scripting.FileSystemObject")

        If objFSO.FolderExists(BinFolderPath) = True Then  ' There are embedded objects

            Set objFolder = objFSO.GetFolder(BinFolderPath)

            fileIndex = 0

           

            

            For Each objFile In objFolder.Files

                If LCase$(Right$(objFile.Name, 4)) = ".bin" Then

                    FileNameIndex = Left$(objFile.Name, Len(objFile.Name) - Len(".bin"))

                    FileNameBin = BinFolderPath + "\" + FileNameIndex + ".bin"

                    FileNamePDF = BinFolderPath + "\" + FileNameIndex + ".pdf"

               

                    hFile = FreeFile

                    Open FileNameBin For Binary Access Read As #hFile

                    Contents = String(LOF(hFile), vbNullChar)

                    Get #hFile, , Contents

                    Close #hFile

               

                    i = InStrB(1, Contents, "%PDF")

                    If i <> 0 Then ' It is a PDF

                         j = InStrB(i, Contents, "%%EOF")

                         If (InStrB(j + 1, Contents, "%%EOF") > 0) Then j = InStrB(j + 1, Contents, "%%EOF")

                    

                         PDF = MidB(Contents, i, j + 5 - i + 12)

                    

                         Open FileNamePDF For Binary Access Write As #hFile

                         Put #hFile, , PDF

                         Close #hFile

                         FolderUp = Left(ExtractedZippedDocxFolder, InStrRev(ExtractedZippedDocxFolder, "\"))

                         FileNameNew = FolderUp + Right(ExtractedZippedDocxFolder, Len(ExtractedZippedDocxFolder) - InStrRev(ExtractedZippedDocxFolder, "\")) + "." + FileNameIndex + ".pdf"

                         Name FileNamePDF As FileNameNew

                    

                         fileIndex = fileIndex + 1

                    Else


                      ' Zip files

                      i = InStrB(1, Contents, "PK" & Chr(3) & Chr(4))

                      If i <> 0 Then ' It is a zip file. Just find the start and go to the end. There will be a little bit of garbage at the end but it'll be ignored by zip programs. It's too complicated to figure out where a ZIP file ends!!

                         ZIP = MidB(Contents, i, Len(Contents))

                         FileNameZIP = BinFolderPath + "\" + FileNameIndex + ".zip"


                         Open FileNameZIP For Binary Access Write As #hFile

                         Put #hFile, , ZIP

                         Close #hFile

                         FolderUp = Left(ExtractedZippedDocxFolder, InStrRev(ExtractedZippedDocxFolder, "\"))

                         FileNameNew = FolderUp + Right(ExtractedZippedDocxFolder, Len(ExtractedZippedDocxFolder) - InStrRev(ExtractedZippedDocxFolder, "\")) + "." + FileNameIndex + ".zip"

                         Name FileNameZIP As FileNameNew

                         fileIndex = fileIndex + 1

                      End If

                   End If

                Else

                   ' Check for docx files

                   If LCase$(Right$(objFile.Name, 5)) = ".docx" Then

                         FolderUp = Left(ExtractedZippedDocxFolder, InStrRev(ExtractedZippedDocxFolder, "\"))

                         FileNameNew = FolderUp + Right(ExtractedZippedDocxFolder, Len(ExtractedZippedDocxFolder) - InStrRev(ExtractedZippedDocxFolder, "\")) + "." + FileNameIndex + ".docx"

                         FileCopy objFile.Path, FileNameNew

                         fileIndex = fileIndex + 1

                   End If

                   ' Check for xlsx files

                   If LCase$(Right$(objFile.Name, 5)) = ".xlsx" Then

                         FolderUp = Left(ExtractedZippedDocxFolder, InStrRev(ExtractedZippedDocxFolder, "\"))

                         FileNameNew = FolderUp + Right(ExtractedZippedDocxFolder, Len(ExtractedZippedDocxFolder) - InStrRev(ExtractedZippedDocxFolder, "\")) + "." + FileNameIndex + ".xlsx"

                         FileCopy objFile.Path, FileNameNew

                         fileIndex = fileIndex + 1

                   End If

                   ' Check for pptx files

                   If LCase$(Right$(objFile.Name, 5)) = ".pptx" Then

                         FolderUp = Left(ExtractedZippedDocxFolder, InStrRev(ExtractedZippedDocxFolder, "\"))

                         FileNameNew = FolderUp + Right(ExtractedZippedDocxFolder, Len(ExtractedZippedDocxFolder) - InStrRev(ExtractedZippedDocxFolder, "\")) + "." + FileNameIndex + ".pptx"

                         FileCopy objFile.Path, FileNameNew

                         fileIndex = fileIndex + 1

                   End If

                   ' Check for vsdx files

                   If LCase$(Right$(objFile.Name, 5)) = ".vsdx" Then

                         FolderUp = Left(ExtractedZippedDocxFolder, InStrRev(ExtractedZippedDocxFolder, "\"))

                         FileNameNew = FolderUp + Right(ExtractedZippedDocxFolder, Len(ExtractedZippedDocxFolder) - InStrRev(ExtractedZippedDocxFolder, "\")) + "." + FileNameIndex + ".vsdx"

                         FileCopy objFile.Path, FileNameNew

                         fileIndex = fileIndex + 1

                   End If

                End If

            Next

            If fileIndex = 0 Then

                Debug.Print "Unable to find any bin file in the givven unzipped docx file content"

            Else

                Debug.Print Str(fileIndex) + "  files were processed"

            End If


        End If


    Next

'    Dim File

'    For Each File In Folder.Files

'    Next

End Sub











Sub export_PDFs()

' This is for a single file only

    Dim Contents As String

    Dim PDF As String

    Dim hFile As Integer

    Dim i As Long, j As Long

    Dim ExtractedZippedDocxFolder, FileNameBin, FileNamePDF, BinFolderPath As String

    Dim fileIndex As Integer

   

    Dim dlgOpen As FileDialog

    Set dlgOpen = Application.FileDialog( _

    FileDialogType:=msoFileDialogFolderPicker)

    With dlgOpen

        .AllowMultiSelect = False

        .Title = "Select the unzipped docx folder to extract PDF file(s) from"

        .InitialFileName = "*.docx"

        .Show

    End With

    ExtractedZippedDocxFolder = dlgOpen.SelectedItems.Item(1)

    BinFolderPath = ExtractedZippedDocxFolder + "\word\embeddings"

    Set objFSO = CreateObject("Scripting.FileSystemObject")

    Set objFolder = objFSO.GetFolder(BinFolderPath)

    fileIndex = 0

   

    For Each objFile In objFolder.Files

        If LCase$(Right$(objFile.Name, 4)) = ".bin" Then

            FileNameIndex = Left$(objFile.Name, Len(objFile.Name) - Len(".bin"))

            FileNameBin = BinFolderPath + "\" + FileNameIndex + ".bin"

            FileNamePDF = BinFolderPath + "\" + FileNameIndex + ".pdf"

       

            hFile = FreeFile

            Open FileNameBin For Binary Access Read As #hFile

            Contents = String(LOF(hFile), vbNullChar)

            Get #hFile, , Contents

            Close #hFile

       

            i = InStrB(1, Contents, "%PDF")

            If i <> 0 Then ' It is a PDF

                 j = InStrB(i, Contents, "%%EOF")

                 If (InStrB(j + 1, Contents, "%%EOF") > 0) Then j = InStrB(j + 1, Contents, "%%EOF")

            

                 PDF = MidB(Contents, i, j + 5 - i + 12)

            

                 Open FileNamePDF For Binary Access Write As #hFile

                 Put #hFile, , PDF

                 Close #hFile

                 fileIndex = fileIndex + 1

            End If

        End If

    Next

    If fileIndex = 0 Then

        MsgBox "Unable to find any bin file in the givven unzipped docx file content"

    Else

        MsgBox Str(fileIndex) + "  files were processed"

    End If

 

End Sub


Other Embedded file formats

For STP files (not sure other files), you can just:

Not sure if this works for other file formats (doesn't seem to work with PDFs).

Below is a macro that does this automatically. Just run the sub ProcessWordDocumentsOle and choose the folder with docx files, no unzipping needed.

Sub CopyOutOLEFiles(doc As Document)

' This is for a single file only

    Dim ole As OLEFormat

    Dim obj As Object

    Dim i As Integer

    Dim shello As Object

    Dim outputFolder As String

    

    outputFolder = doc.Path

    

    

    ' Check each object in the active document

    For Each obj In doc.InlineShapes

        ' Check if the object is an OLE object

        If obj.Type = wdInlineShapeEmbeddedOLEObject Or obj.Type = wdInlineShapeLinkedOLEObject Then

            Set ole = obj.OLEFormat

            ' Try to print the source name (filename)

            On Error Resume Next ' Skip if error occurs

            If Right(ole.IconLabel, 3) = "pdf" Or Right(ole.IconLabel, 4) = "docx" Or Right(ole.IconLabel, 4) = "xlsx" Or Right(ole.IconLabel, 4) = "vsdx" Then

              ' Do nothing

            Else

              Debug.Print ole.IconLabel

              obj.Select

              Selection.Copy

              CreateObject("Shell.Application").Namespace(outputFolder & "\").Self.InvokeVerb "Paste"

              oldFilePath = outputFolder & "\" & ole.IconLabel

              newFilePath = outputFolder & "\" & doc.Name & "_" & ole.IconLabel

              Name oldFilePath As newFilePath ' Sometimes it can't rename it because it is given a different filename than the IconLabel!!

              On Error GoTo 0 ' Stop ignoring errors

            End If


            DoEvents

        End If

        i = i + 1

    Next obj

End Sub



Sub ProcessWordDocumentsOle()

    Dim fd As FileDialog

    Dim selectedFolder As String

    Dim wordFile As String

    Dim doc As Document

    Dim FileSystem As Object


    ' Create a FileDialog object as a Folder Picker dialog box.

    Set fd = Application.FileDialog(msoFileDialogFolderPicker)


    If fd.Show = -1 Then

        selectedFolder = fd.SelectedItems(1) ' Get the selected folder path

        

        ' Make sure the folder path ends with a backslash

        If Right(selectedFolder, 1) <> "\" Then selectedFolder = selectedFolder & "\"


        ' Set the path of the first Word document (.doc or .docx)

        wordFile = Dir(selectedFolder & "*.doc*")


        While wordFile <> ""

            ' Open the Word document

            Set doc = Documents.Open(FileName:=selectedFolder & wordFile)


            ' Call the sample function to process the document

            Call CopyOutOLEFiles(doc)


            ' Save and close the document

            doc.Close SaveChanges:=wdDoNotSaveChanges


            ' Get the next Word document

            wordFile = Dir

        Wend

        

        MsgBox "All Word documents in the selected folder have been processed."

    Else

        MsgBox "No folder was selected."

    End If


    ' Clean up

    Set fd = Nothing

End Sub