我在阿里云OSS上创建了一张Hudi COW表并注册到Hive 3.1.2 MetaStore,使用Hive查询没有问题。安装StarRocks 2.2.0,按照官网说明去创建Hudi外表,创建的过程中报错如下,这是因为hadoop版本原因么?我用的是开源 hadoop 2.10.1。
mysql> CREATE EXTERNAL TABLE demo_trips_cow
(
-> begin_lat
double NULL,
-> begin_lon
double NULL,
-> driver
varchar(200) NULL,
-> end_lat
double NULL,
-> end_lon
double NULL,
-> fare
double NULL,
-> partitionpath
varchar(200) NULL,
-> rider
varchar(200) NULL,
-> ts
bigint NULL,
-> uuid
varchar(200) NULL,
-> continent
varchar(200) NULL,
-> country
varchar(200) NULL,
-> city
varchar(200) NULL
-> ) ENGINE=HUDI
-> PROPERTIES (
-> "resource" = "hudi0",
-> "database" = "default",
-> "table" = "demo_trips_cow"
-> );
ERROR 1064 (HY000): Unexpected exception: Failed to get instance of org.apache.hadoop.fs.FileSystem
之后改用通过hive外表方式查询这张hudi表也会报错 ERROR 1064 (HY000): com.starrocks.common.DdlException: get partition detail failed: com.starrocks.common.DdlException: get hive partition meta data failed: unsupported file format [org.apache.hudi.hadoop.HoodieParquetInputFormat]
而后我又试了用SR查询另一张普通Hive表(数据文件在HDFS上)是没有问题的。
在Hive上查看Hudi表的建表语句如下
CREATE EXTERNAL TABLE demo_trips_cow
(
_hoodie_commit_time
string,
_hoodie_commit_seqno
string,
_hoodie_record_key
string,
_hoodie_partition_path
string,
_hoodie_file_name
string,
begin_lat
double,
begin_lon
double,
driver
string,
end_lat
double,
end_lon
double,
fare
double,
partitionpath
string,
rider
string,
ts
bigint,
uuid
string)
PARTITIONED BY (
continent
string,
country
string,
city
string)
ROW FORMAT SERDE
'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
WITH SERDEPROPERTIES (
'hoodie.query.as.ro.table'='false',
'path'='oss://datalake-huifu/hudi/demo_trips_cow')
STORED AS INPUTFORMAT
'org.apache.hudi.hadoop.HoodieParquetInputFormat'
OUTPUTFORMAT
'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat'
LOCATION
'oss://datalake-huifu/hudi/demo_trips_cow'
TBLPROPERTIES (
'bucketing_version'='2',
'last_commit_time_sync'='20220519161739696',
'spark.sql.create.version'='3.2.1',
'spark.sql.sources.provider'='hudi',
'spark.sql.sources.schema.numPartCols'='3',
'spark.sql.sources.schema.numParts'='1',
'spark.sql.sources.schema.part.0'='{"type":"struct","fields":[{"name":"_hoodie_commit_time","type":"string","nullable":true,"metadata":{}},{"name":"_hoodie_commit_seqno","type":"string","nullable":true,"metadata":{}},{"name":"_hoodie_record_key","type":"string","nullable":true,"metadata":{}},{"name":"_hoodie_partition_path","type":"string","nullable":true,"metadata":{}},{"name":"_hoodie_file_name","type":"string","nullable":true,"metadata":{}},{"name":"begin_lat","type":"double","nullable":true,"metadata":{}},{"name":"begin_lon","type":"double","nullable":true,"metadata":{}},{"name":"driver","type":"string","nullable":true,"metadata":{}},{"name":"end_lat","type":"double","nullable":true,"metadata":{}},{"name":"end_lon","type":"double","nullable":true,"metadata":{}},{"name":"fare","type":"double","nullable":true,"metadata":{}},{"name":"partitionpath","type":"string","nullable":true,"metadata":{}},{"name":"rider","type":"string","nullable":true,"metadata":{}},{"name":"ts","type":"long","nullable":true,"metadata":{}},{"name":"uuid","type":"string","nullable":true,"metadata":{}},{"name":"continent","type":"string","nullable":false,"metadata":{}},{"name":"country","type":"string","nullable":false,"metadata":{}},{"name":"city","type":"string","nullable":false,"metadata":{}}]}',
'spark.sql.sources.schema.partCol.0'='continent',
'spark.sql.sources.schema.partCol.1'='country',
'spark.sql.sources.schema.partCol.2'='city',
'transient_lastDdlTime'='1652948282');