Post date: Oct 24, 2015 4:41:27 AM
We want to convert key-value file into a flat format file. Each key-value record is in the format of key:value and the key-value records are separated by a delimiter ','. Each line of the file will be map to one record in the output and will be rearranged according to the column sequence defined by the user. The code still works even when the number of records in each row not equal or the order of the keys are different by each row. For example,
Input file: test_example.csv
user:123,age:23,sex:F,city:Bostonuser:562,age:45,sex:M,state:WAuser:7843,sex:M,state:FL,city:Miamiuser:507,state:WA,age:45,city:Seattle,sex:Mcity:Woodinville,sex:F,user:95402,state:,age:15desired output
user,age,sex,state,city123,23,F,None,Boston562,45,M,WA,None7843,None,M,FL,Miami507,45,M,WA,Seattle95402,15,F,None,WoodinvilleFollow steps here:
test_example.csv using this command:sed 's/\,/\n/g' test_example.csv | perl -pe 's/:(.*?)$//g' | sort | uniq  > test_example_headeragecitysexstateuservi to edit the header variables and here is the edited test_example_header. This header file will be used in the next step to populate the variables from the original input file with the same order as determined by the user.agesexstatecitypython3.3 create_flat_file.py -h test_example_header -c test_example.csv -o output -b 10000 -d , -k : -m 'None'#!/apollo/env/MLEnvImprovement/bin/python3.3  import sys, getopt  def create_header_list(path_to_header):     """     This is a helper function to convert a header file (one variable for each row) to a list of header.     """     # make header list     var_list = []     for line in open(path_to_header):         var = line.strip()         print("var: %s, len: %d" % (var, len(var)) )         if len(var) > 0:             var_list.append(var)     # display the variable list     print(var_list)          return var_list           def convert_to_flat(path_to_header, path_to_content, path_to_output, buffer_size=10000, delim_col=',', delim_key=':', missing_fill='None'):     """     This program converts key:value file into csv flat file.      Given the header (path_to_header), the program read and convert the content file (path_to_content) line by line.     I ran the program on a 51GB (27M rows x >30 columns) and it took 2 hours.          Example:          Input:      user:123,age:23,sex:F,city:Boston     user:562,age:45,sex:M,state:WA     user:7843,sex:M,state:FL,city:Miami     [intentionally blanked]     user:507,state:WA,age:45,city:Seattle,sex:M     city:Woodinville,sex:F,user:95402,state:,age:15     [intentionally blanked]     [intentionally blanked]          Header output:     user,age,sex,state,city          Content output:      123,23,F,None,Boston     562,45,M,WA,None     7843,None,M,FL,Miami     507,45,M,WA,Seattle     95402,15,F,None,Woodinville          kittipat@gmail.com     2015-10-23     """     # create header list     var_list = create_header_list(path_to_header)          # output the header     header_file = open(path_to_output + '.sig', 'a')     header_file.write(','.join(var_list))       header_file.close()          # open the output file     out_file = open(path_to_output, 'a')     str_buffer = ''      # initialize dict     d = {v: None for v in var_list}      # For loop to fill each variable, read file line by line     cnt = 1     for line in open(path_to_content):                  # reset hash values         d = d.fromkeys(d, None)                  # loop to hash key-value         if (len(line) > 0) & (delim_key in line):                          # put into hash             for chunk in line.split(delim_col):                 s = chunk.split(delim_key)                 d[s[0]] = s[1].strip()                          # loop to write each variable into a line             my_line = []             for var in var_list:                 if d[var]:                     my_line.append(d[var])                 else:                     my_line.append(missing_fill)              # write to string buffer             str_buffer = str_buffer + ','.join(my_line) + '\n'                          # write to a file when buffer is full                if ( cnt % buffer_size) == 0:                 print('@line: %d' % cnt)                 out_file.write(str_buffer)                 str_buffer = '' # reset the string buffer                          #else: print("Warning 'Invalid line' -- line: %d, len: %d, content: \"%s\"" % (cnt, len(line), line))          # increment         cnt = cnt + 1      # if buffer is not written yet     if len(str_buffer) > 0:         out_file.write(str_buffer)            # close the file handle     out_file.close()           return 1       def main(argv):     # Set default values     path_to_header = None     path_to_content = None     path_to_output = None     buffer_size = 10000     delim_col = ','     delim_key = ':'     missing_fill = 'None'     try:         opts, args = getopt.getopt(argv,"h:c:o:b:d:k:m:",["hfile=","cfile=","ofile=","buffer=","delim_col=","delim_key=","miss_fill="])     except getopt.GetoptError:         print('create_flat_file.py -h <header_file> -c <content_file> -o <output_file> -b <buffer_size> -d <delim_col> -k <delim_key> -m <miss_fill>')         sys.exit(2)              # loop to receive value from arg         for opt, arg in opts:         if opt in ("-h", "--hfile"):             path_to_header = arg         elif opt in ("-c", "--cfile"):             path_to_content = arg         elif opt in ("-o", "--ofile"):             path_to_output = arg         elif opt in ("-b", "--buffer"):             buffer_size = int(arg)         elif opt in ("-d", "--delim_col"):             delim_col = arg         elif opt in ("-k", "--delim_key"):             delim_key = arg         elif opt in ("-m", "--miss_fill"):             missing_fill = arg            # display the argument values     print('hfile: %s' % path_to_header)     print('cfile: %s' % path_to_content)     print('ofile: %s' % path_to_output)     print('buffer: %s' % buffer_size)     print('delim_col: %s' % delim_col)     print('delim_key: %s' % delim_key)     print('miss_fill: %s' % missing_fill)          # create the flat file     convert_to_flat(path_to_header                     , path_to_content                     , path_to_output                     , buffer_size                     , delim_col                     , delim_key                     , missing_fill)   if __name__ == "__main__":     main(sys.argv[1:])