Post date: Feb 17, 2016 7:16:20 AM
Here is how we perform set operations like union, intersect, size and Jaccard similarity:
-- Add the UDF jar fileadd jar hdfs:///hiveudfs/brickhouse.jar;-- Drop existing functions if exist, so that no conflictsdrop function if exists combine_unique;drop function if exists array_intersect;drop function if exists combine;-- If you want the functions loaded temporary in this session onlyCREATE TEMPORARY FUNCTION combine AS 'brickhouse.udf.collect.CombineUDF';CREATE TEMPORARY FUNCTION combine_unique AS 'brickhouse.udf.collect.CombineUniqueUDAF';CREATE TEMPORARY FUNCTION array_intersect AS "brickhouse.udf.collect.ArrayIntersectUDF";-- Or you can make it permanentlyCREATE FUNCTION combine AS 'brickhouse.udf.collect.CombineUDF';CREATE FUNCTION combine_unique AS 'brickhouse.udf.collect.CombineUniqueUDAF';CREATE FUNCTION array_intersect AS "brickhouse.udf.collect.ArrayIntersectUDF";-- This is my test queryset hive.cli.print.header=true;select combine(array('a','b','c'), array('b','c','d')) as combine, combine_unique(combine(array('a','b','c'), array('b','c','d'))) as union, array_intersect(array('a','b','c'), array('b','c','d')) as intersect, size(array_intersect(array('a','b','c'), array('b','c','d')))/size(combine_unique(combine(array('a','b','c'), array('b','c','d')))) as jaccard_similarity;Further resources:
http://www.sohuaz.xyz/questions/3910217/array-intersect-hive
http://stackoverflow.com/questions/20043448/how-to-add-a-permanent-function-in-hive