Connect & read Datalake from Python

A couple of Azure libraries need to be installed first.

#pip install azure-storage-file-datalake

#pip install azure-identity


If using a service principal:

from azure.storage.filedatalake import DataLakeServiceClient

from azure.identity import ClientSecretCredential

# Fill in the service principal (app registration) details and the target
# storage account / container before running.
tenant_id = ""
app_id = ""
secret = ""
storage_account_name = ""
container_name = ''

# Data Lake (dfs) endpoint of the storage account.
account_url = 'https://{}.dfs.core.windows.net/'.format(storage_account_name)

# Authenticate as the service principal and open the container.
credential = ClientSecretCredential(tenant_id, app_id, secret)
service = DataLakeServiceClient(account_url=account_url, credential=credential)
file_system_client = service.get_file_system_client(container_name)

# Print the name of every path found under the container root.
# (The print must be indented under the loop; otherwise Python raises
# an IndentationError.)
for path in file_system_client.get_paths('/'):
    print(path.name)


If using an access key:

from azure.storage.filedatalake import DataLakeServiceClient


# Fill in the storage account name and its access key before running.
# storage_account_name is defined here so this snippet runs on its own,
# without depending on the service principal example.
storage_account_name = ''
container_name = ''  # not used by the listing below
access_key = ''

# Data Lake (dfs) endpoint of the storage account.
account_url = 'https://{}.dfs.core.windows.net/'.format(storage_account_name)

# The account access key is passed directly as the credential.
datalake_service_client = DataLakeServiceClient(account_url=account_url, credential=access_key)

# Print the name of every file system (container) in the account.
file_systems = datalake_service_client.list_file_systems()
for file_system in file_systems:
    print(file_system.name)



Note: different service principals / access keys may have different levels of access to the data lake. As a result, some operations may fail (e.g. listing file systems, i.e. containers), while others (e.g. getting the paths of a specific container) may still work.


To download files from a subdirectory using access key

Connect to the file system and list all files under a directory. Then download each of the files, appending them into a single local file. Go from there, e.g. load the file into pandas, etc.

#https://docs.microsoft.com/en-us/azure/architecture/data-science-process/explore-data-blob

from azure.storage.filedatalake import DataLakeServiceClient


# Storage account endpoint and access key (placeholders - replace with real values).
endpoint = 'https://mydatalake001.dfs.core.windows.net/'
access_key = 'xxxxxxx'
service_client = DataLakeServiceClient(account_url=endpoint, credential=access_key)

# Container and directory to download from.
container = 'mycontianer'
site = 'xxx'
partition = '2022-07-26'
path = f"osipi/archive/{site}/csv/{partition}"

# All downloaded files are appended, in listing order, into this one local file.
local_temp_file = 'c:\\temp\\download.csv'

try:
    file_system_client = service_client.get_file_system_client(file_system=container)
    directory_client = file_system_client.get_directory_client(path)
    paths = file_system_client.get_paths(path)  # everything under the directory

    with open(local_temp_file, 'wb') as local_file:
        # A separate loop variable keeps `path` (the directory string) intact.
        for entry in paths:
            # The listing can include directory entries, which cannot be
            # downloaded as files, so skip them.
            if entry.is_directory:
                continue
            # print('downloading {}'.format(entry.name))
            file_client = directory_client.get_file_client(entry)
            download = file_client.download_file()
            downloaded_bytes = download.readall()
            local_file.write(downloaded_bytes)
except Exception as e:
    # Best-effort example: report the failure instead of raising.
    print(e)


To download files from subdirectory using service principal

#pip install azure-storage-file-datalake

#pip install azure-identity

from azure.storage.filedatalake import DataLakeServiceClient

from azure.identity import ClientSecretCredential

import pandas as pd

# Service principal details and storage account (placeholders - replace with real values).
tenant_id = "xxx"
app_id = "xxxe"
secret = "xxx"
storage_account_name = "xxx"

# Data Lake (dfs) endpoint of the storage account.
account_url = 'https://{}.dfs.core.windows.net/'.format(storage_account_name)

# Authenticate as the service principal.
credential = ClientSecretCredential(tenant_id, app_id, secret)
service = DataLakeServiceClient(account_url=account_url, credential=credential)

# Container and directory to download from.
container_name = 'containername'
path = 'dir1/dir2'

# All downloaded files are appended, in listing order, into this one local file.
local_temp_file = 'c:\\temp\\download.csv'

try:
    file_system_client = service.get_file_system_client(container_name)
    directory_client = file_system_client.get_directory_client(path)
    paths = file_system_client.get_paths(path)  # everything under the directory

    with open(local_temp_file, 'wb') as local_file:
        # A separate loop variable keeps `path` (the directory string) intact.
        for entry in paths:
            # The listing can include directory entries, which cannot be
            # downloaded as files, so skip them.
            if entry.is_directory:
                continue
            print('downloading {}'.format(entry.name))
            file_client = directory_client.get_file_client(entry)
            download = file_client.download_file()
            downloaded_bytes = download.readall()
            local_file.write(downloaded_bytes)

    # Read the file only after the `with` block has flushed and closed it;
    # reading while it is still open for writing can miss buffered data.
    df = pd.read_csv(local_temp_file)
    # Print explicitly: a bare `df.head()` inside a try block displays nothing.
    print(df.head())
except Exception as e:
    # Best-effort example: report the failure instead of raising.
    print(e)