本文將介紹向量檢索版兩種二進制數據召回結果的處理方式(protobuf和flatbuffers)。
protobuf格式
Maven依賴
<!-- Version properties referenced by the dependency declarations below. -->
<properties>
<grpc.version>1.6.1</grpc.version>
<protobuf.version>3.21.5</protobuf.version>
</properties>
<!-- protobuf-java artifact, versioned by ${protobuf.version}. -->
<dependency>
<groupId>com.google.protobuf</groupId>
<artifactId>protobuf-java</artifactId>
<version>${protobuf.version}</version>
</dependency>
<!-- protobuf-java-util artifact, kept on the same ${protobuf.version}. -->
<dependency>
<groupId>com.google.protobuf</groupId>
<artifactId>protobuf-java-util</artifactId>
<version>${protobuf.version}</version>
</dependency>
<!--
  grpc-netty and grpc-protobuf are declared with "provided" scope and share ${grpc.version}.
  NOTE(review): the Java example in this document imports no io.grpc classes;
  confirm whether these two artifacts are actually required.
-->
<dependency>
<groupId>io.grpc</groupId>
<artifactId>grpc-netty</artifactId>
<version>${grpc.version}</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>io.grpc</groupId>
<artifactId>grpc-protobuf</artifactId>
<version>${grpc.version}</version>
<scope>provided</scope>
</dependency>
生成protobuf文件
將Maven依賴引入項目後,還需生成protobuf相應的文件,才能將向量檢索版實例召回的結果正常解析成protobuf格式。生成protobuf文件的步驟如下:
安裝protobuf環境,推薦proto版本 【3.21.5】,可以通過命令
protoc --version
查看版本。
在項目中定義描述文件,文件後綴為【.proto】,可根據需求自行更改,案例如下:
【Ha3ResultProto.proto】
syntax = "proto2";
// No java_package option is set, so this proto package is also the Java package of
// the generated Ha3ResultProto class. It must match the import used by the Java
// example (com.aliyun.demo.protobuf.Ha3ResultProto).
package com.aliyun.demo.protobuf;
option cc_enable_arenas = true;

// Attribute entry: one key with repeated int64, double and bytes values.
message PBAttrKVPair {
    optional string key = 1;
    repeated int64 int64Value = 2;
    repeated double doubleValue = 3;
    repeated bytes bytesValue = 4;
}

// Kind tag carried in the `type` field of the typed attribute messages below.
enum ValueType {
    ATTRIBUTE_TYPE = 0;
    VARIABLE_VALUE_TYPE = 1;
}

// Named int64 attribute: a key, a ValueType tag, the values and a list of offsets.
// NOTE(review): `offset` presumably delimits per-document ranges in int64Value - confirm.
message PBInt64Attribute {
    optional string key = 1;
    optional ValueType type = 2;
    repeated int64 int64Value = 3;
    repeated uint32 offset = 4;
}

// Same layout as PBInt64Attribute, holding double values.
message PBDoubleAttribute {
    optional string key = 1;
    optional ValueType type = 2;
    repeated double doubleValue = 3;
    repeated uint32 offset = 4;
}

// Same layout as PBInt64Attribute, holding bytes values.
message PBBytesAttribute {
    optional string key = 1;
    optional ValueType type = 2;
    repeated bytes bytesValue = 3;
    repeated uint32 offset = 4;
}

// Describes one sort expression: a boolean flag plus the expression name.
message SortExprssionMeta {
    optional bool sortFlag = 1;
    optional string sortExprName = 2;
}

// Sort values together with their dimension count and sort-expression metadata.
message PBSortValues {
    optional uint32 dimensionCount = 1;
    repeated double sortValues = 2;
    repeated SortExprssionMeta sortExprMetas = 3;
}

// Generic pair of a string key and a bytes value.
message PBKVPair {
    optional string key = 1;
    optional bytes value = 2;
}

// Top-level message: the Java example parses the response body into this type.
message PBResult {
    optional uint64 totalTime = 1;
    optional PBHits hits = 2;
    repeated PBAggregateResults aggResults = 3;
    repeated PBErrorResult errorResults = 4;
    optional bytes tracer = 5;
    optional bool fromCache = 6;
    optional PBMatchDocs matchDocs = 7;
    repeated PBMetaMap metaMap = 8;
}

// Match documents stored as parallel repeated fields plus typed attribute lists.
message PBMatchDocs {
    optional uint32 numMatchDocs = 1;
    optional uint32 totalMatchDocs = 2;
    repeated string clusterNames = 3;
    repeated uint32 clusterIds = 4;
    repeated uint32 hashids = 5;
    repeated uint32 docids = 6;
    repeated int32 fullIndexVersions = 7;
    repeated int32 indexVersions = 8;
    repeated uint64 pkHighers = 9;
    repeated uint64 pkLowers = 10;
    repeated uint32 searcherIps = 11;
    repeated bytes tracers = 12;
    repeated PBInt64Attribute int64AttrValues = 13;
    repeated PBDoubleAttribute doubleAttrValues = 14;
    repeated PBBytesAttribute bytesAttrValues = 15;
    optional PBSortValues sortValues = 16;
}

// Hit list: hit counters, the hits themselves, meta-hit maps and sort metadata.
message PBHits {
    optional uint32 numhits = 1;
    optional uint32 totalHits = 2;
    repeated PBHit hit = 3;
    repeated PBMetaHitMap metaHitMap = 4;
    optional double coveredPercent = 5;
    repeated SortExprssionMeta sortExprMetas = 6;
}

// A single hit with its identifiers, attributes, summary and property pairs.
message PBHit {
    optional string clusterName = 1;
    optional uint32 hashid = 2;
    optional uint32 docid = 3;
    optional int32 fullIndexVersion = 4;
    optional int32 indexVersion = 5;
    optional uint64 pkHigher = 6;
    optional uint64 pkLower = 7;
    repeated PBAttrKVPair attributes = 8;
    repeated PBAttrKVPair variableValues = 9;
    repeated PBKVPair summary = 10;
    repeated PBKVPair property = 11;
    repeated string sortValues = 12;
    optional bytes tracer = 13;
    optional uint32 searcherIp = 14;
    optional string rawPk = 15;
    optional bytes summaryBytes = 16;
}

// Keyed group of key/value pairs attached to the hit list.
message PBMetaHitMap {
    optional string metaHitKey = 1;
    repeated PBKVPair metaHitValue = 2;
}

// Aggregate results for one aggregate key.
message PBAggregateResults {
    optional string aggregateKey = 1;
    repeated PBAggregateValue aggregateValue = 2;
}

// One group value with its function-name/result pairs.
message PBAggregateValue {
    optional string groupValue = 1;
    repeated PBKVPair funNameResultPair = 2;
}

// Error entry: partition id, host name, error code and description.
message PBErrorResult {
    optional string partitionId = 1;
    optional string hostName = 2;
    optional uint32 errorCode = 3;
    optional string errorDescription = 4;
}

// Keyed group of key/value pairs attached to the result.
message PBMetaMap {
    optional string metaKey = 1;
    repeated PBKVPair metaValue = 2;
}
在對應protobuf描述文件目錄下執行命令
protoc --java_out=./ Ha3ResultProto.proto
生成Java文件路徑可以在描述文件中package指定,例如:
package com.aliyun.demo.protobuf
執行命令後,會自動在指定的package中生成對應的Java文件。以上述為例,會在
com.aliyun.demo.protobuf
包下生成一個Ha3ResultProto.java
的文件,在通過SDK解析向量檢索版實例的召回結果時,可直接引用:
import com.aliyun.ha3engine.Client;
import com.aliyun.ha3engine.models.*;
import com.aliyun.tea.TeaException;
import com.aliyun.demo.protobuf.Ha3ResultProto;
import org.junit.Before;
import org.junit.Test;
import java.nio.ByteBuffer;
import java.util.*;
/**
 * Example test that queries a Vector Retrieval Edition instance and parses the
 * binary response body as a protobuf {@code PBResult}.
 */
public class DataFormatService {

    /**
     * Vector Retrieval Edition client; query operations are supported for now.
     */
    private Client client;

    /**
     * Creates the client from a freshly populated {@code Config} before each test.
     *
     * @throws Exception if the client cannot be constructed
     */
    @Before
    public void clientInit() throws Exception {
        Config clientConfig = new Config();

        // API endpoint: see Instance Details > API Entry.
        clientConfig.setEndpoint("");
        // Instance name: shown at the top-left of the instance details page, e.g. ha-cn-i7*****605.
        clientConfig.setInstanceId("");
        // User name: see Instance Details > Network Information.
        clientConfig.setAccessUserName("");
        // Password: can be changed under Instance Details > Network Information.
        clientConfig.setAccessPassWord("");
        // Fill in httpProxy when calling over the public network.
        clientConfig.setHttpProxy("");

        client = new Client(clientConfig);
    }

    /**
     * Runs an HA query with {@code format:protobuf}, prints the raw bytes, then
     * parses them into {@code Ha3ResultProto.PBResult} and prints the message.
     *
     * @throws Exception if the request or the protobuf parsing fails with anything
     *                   other than a {@code TeaException}
     */
    @Test
    public void protobufFormat() throws Exception {
        try {
            // Example: search with an HA query string.
            SearchRequestModel request = new SearchRequestModel();
            SearchQuery query = new SearchQuery();
            query.setQuery("query=id:8148508889615505646&&config=start:0,hit:100,format:protobuf&&cluster=general");
            request.setQuery(query);

            SearchBytesResponseModel response = client.SearchBytes(request);
            System.out.println("ha查詢串搜索結果:\n" + Arrays.toString(response.getBody()));

            // Decode the response body as protobuf.
            Ha3ResultProto.PBResult parsed = Ha3ResultProto.PBResult.parseFrom(response.getBody());
            System.out.println("protobuf格式輸出結果:\n" + parsed);
        } catch (TeaException e) {
            printError(e);
        }
    }

    /**
     * Prints the code, message and JSON-serialized data of a {@code TeaException}.
     */
    private static void printError(TeaException e) {
        System.out.println(e.getCode());
        System.out.println(e.getMessage());
        Map<String, Object> errorData = e.getData();
        System.out.println(com.aliyun.teautil.Common.toJSONString(errorData));
    }
}
注意事項
protobuf格式只適用於ha查詢方式,使用時需定義format:protobuf;如不需要protobuf格式,可使用普通json,定義format:json即可。
client提供了Search和SearchBytes兩種查詢方式:Search方式返回的body為String格式數據,SearchBytes方式返回的body為byte[]格式數據。SearchBytes查詢方法只能在aliyun-sdk-ha3engine 1.3.2版本中使用。
protobuf轉換必須使用
aliyun-sdk-ha3engine 1.3.2
版本
flatbuffers格式
Maven依賴
<!-- Version property referenced by the flatbuffers-java dependency below. -->
<properties>
<flatbuffers.java.version>2.0.7</flatbuffers.java.version>
</properties>
<!-- flatbuffers-java artifact, versioned by ${flatbuffers.java.version}. -->
<dependency>
<groupId>com.google.flatbuffers</groupId>
<artifactId>flatbuffers-java</artifactId>
<version>${flatbuffers.java.version}</version>
</dependency>
生成flatbuffers文件
將Maven依賴引入項目後,還需生成flatbuffers相應的文件,才能將向量檢索版實例召回的結果正常解析成flatbuffers格式。生成flatbuffers文件的步驟如下:
安裝flatbuffers環境,推薦flatbuffers版本 【2.0.7】,可以通過命令
flatc --version
查看版本。
在項目中定義描述文件,文件後綴為【.fbs】,可以根據需求自行更改,案例如下:
【SqlResult.fbs】
include "TwoDimTable.fbs";
// The namespace becomes the Java package of the generated classes. It must match the
// imports used by the Java example (com.aliyun.demo.flatbuffers.SqlResult) and the
// namespace declared in TwoDimTable.fbs.
namespace com.aliyun.demo.flatbuffers;

// Error entry: partition id, host name, error code and description.
table SqlErrorResult {
    partitionId: string (id:0);
    hostName: string (id:1);
    errorCode: uint (id:2);
    errorDescription: string (id:3);
}

// Root table: the Java example reads the response body as this type.
table SqlResult {
    processTime: double (id:0);
    rowCount: uint32 (id:1);
    errorResult: SqlErrorResult (id:2);
    sqlTable: TwoDimTable (id:3);
    searchInfo: string (id:4);
}

root_type SqlResult;
【TwoDimTable.fbs】
// Same namespace as SqlResult.fbs, which references TwoDimTable without qualification.
namespace com.aliyun.demo.flatbuffers;

// multi value
table MultiInt8 { value: [byte]; }
table MultiInt16 { value: [short]; }
table MultiInt32 { value: [int]; }
table MultiInt64 { value: [long]; }
table MultiUInt8 { value: [ubyte]; }
table MultiUInt16 { value: [ushort]; }
table MultiUInt32 { value: [uint]; }
table MultiUInt64 { value: [ulong]; }
table MultiFloat { value: [float]; }
table MultiDouble { value: [double]; }
table MultiString { value: [string]; }

// column base storage
table Int8Column { value: [byte]; }
table Int16Column { value: [short]; }
table Int32Column { value: [int]; }
table Int64Column { value: [long]; }
table UInt8Column { value: [ubyte]; }
table UInt16Column { value: [ushort]; }
table UInt32Column { value: [uint]; }
table UInt64Column { value: [ulong]; }
table FloatColumn { value: [float]; }
table DoubleColumn { value: [double]; }
table StringColumn { value: [string]; }
table MultiInt8Column { value: [MultiInt8]; }
table MultiUInt8Column { value: [MultiUInt8]; }
table MultiInt16Column { value: [MultiInt16]; }
table MultiUInt16Column { value: [MultiUInt16]; }
table MultiInt32Column { value: [MultiInt32]; }
table MultiUInt32Column { value: [MultiUInt32]; }
table MultiInt64Column { value: [MultiInt64]; }
table MultiUInt64Column { value: [MultiUInt64]; }
table MultiFloatColumn { value: [MultiFloat]; }
table MultiDoubleColumn { value: [MultiDouble]; }
table MultiStringColumn { value: [MultiString]; }

// column type
// Union over every column table above; Column.value holds exactly one of them.
union ColumnType {
    Int8Column,
    Int16Column,
    Int32Column,
    Int64Column,
    UInt8Column,
    UInt16Column,
    UInt32Column,
    UInt64Column,
    FloatColumn,
    DoubleColumn,
    StringColumn,
    MultiInt8Column,
    MultiInt16Column,
    MultiInt32Column,
    MultiInt64Column,
    MultiUInt8Column,
    MultiUInt16Column,
    MultiUInt32Column,
    MultiUInt64Column,
    MultiFloatColumn,
    MultiDoubleColumn,
    MultiStringColumn,
}

// A named column whose values are stored in one of the ColumnType tables.
table Column {
    name: string;
    value: ColumnType;
}

// Table made of a row count and a list of columns.
table TwoDimTable {
    rowCount: uint (id:0);
    columns: [Column] (id:1);
}
在對應flatbuffers描述文件目錄下執行命令
# --java selects the Java generator, -o sets the output directory, and --gen-all also generates code for the included TwoDimTable.fbs
flatc --java --gen-all -o ./ SqlResult.fbs
生成Java文件路徑可以在描述文件中【namespace】指定,例如:
namespace com.aliyun.demo.flatbuffers
執行命令後,會自動在指定的namespace對應的包中生成對應的Java文件。以上述為例,會在
com.aliyun.demo.flatbuffers
包下生成若干.java
的文件,在通過SDK解析向量檢索版實例的召回結果時,可直接引用:
import com.aliyun.ha3engine.Client;
import com.aliyun.ha3engine.models.*;
import com.aliyun.tea.TeaException;
import com.aliyun.demo.flatbuffers.Int64Column;
import com.aliyun.demo.flatbuffers.SqlResult;
import org.junit.Before;
import org.junit.Test;
import java.nio.ByteBuffer;
import java.util.*;
/**
 * Example test that runs a SQL query against a Vector Retrieval Edition instance
 * and reads the binary response body as a FlatBuffers {@code SqlResult}.
 */
public class DataFormatService {

    /**
     * Vector Retrieval Edition client; query operations are supported for now.
     */
    private Client client;

    /**
     * Creates the client from a freshly populated {@code Config} before each test.
     *
     * @throws Exception if the client cannot be constructed
     */
    @Before
    public void clientInit() throws Exception {
        Config clientConfig = new Config();

        // API endpoint: see Instance Details > API Entry.
        clientConfig.setEndpoint("");
        // Instance name: shown at the top-left of the instance details page, e.g. ha-cn-i7*****605.
        clientConfig.setInstanceId("");
        // User name: see Instance Details > Network Information.
        clientConfig.setAccessUserName("");
        // Password: can be changed under Instance Details > Network Information.
        clientConfig.setAccessPassWord("");
        // Fill in httpProxy when calling over the public network.
        clientConfig.setHttpProxy("");

        client = new Client(clientConfig);
    }

    /**
     * Runs a SQL query with {@code format:flatbuffers}, prints the raw bytes, then
     * reads the first column as an {@code Int64Column} and prints every value.
     *
     * @throws Exception if the request or decoding fails with anything other than
     *                   a {@code TeaException}
     */
    @Test
    public void flatBuffersFormat() throws Exception {
        try {
            // Example: search with a SQL query string.
            SearchRequestModel request = new SearchRequestModel();
            SearchQuery query = new SearchQuery();
            query.setSql("query=select * from indexTableName&&kvpair=trace:INFO;format:flatbuffers");
            request.setQuery(query);

            SearchBytesResponseModel response = client.SearchBytes(request);
            System.out.println("sql 查詢串搜索結果:\n" + Arrays.toString(response.getBody()));

            // Read the response body as a FlatBuffers SqlResult.
            SqlResult sqlResult = SqlResult.getRootAsSqlResult(ByteBuffer.wrap(response.getBody()));

            // Pick the column class that matches the type of the returned field. Suppose
            // the result has three fields: id, content and url. The first field (id) is
            // int64, so it is received as an Int64Column; a String field would use
            // StringColumn, and so on.
            Int64Column idColumn = (Int64Column) sqlResult.sqlTable().columns(0).value(new Int64Column());

            // Field name; with fields returned in the order id, content, url this is "id".
            String name = sqlResult.sqlTable().columns(0).name();
            System.out.println("字段名稱=" + name);

            // Number of values held by this field.
            int total = idColumn.valueLength();
            System.out.println(name + "字段數據條數=" + total);

            // Walk the values; the loop body is skipped when the column is empty.
            for (int index = 0; index < total; index++) {
                long value = idColumn.value(index);
                int ordinal = index + 1;
                System.out.println(name + "字段第" + ordinal + "條數據=" + value);
            }
        } catch (TeaException e) {
            printError(e);
        }
    }

    /**
     * Prints the code, message and JSON-serialized data of a {@code TeaException}.
     */
    private static void printError(TeaException e) {
        System.out.println(e.getCode());
        System.out.println(e.getMessage());
        Map<String, Object> errorData = e.getData();
        System.out.println(com.aliyun.teautil.Common.toJSONString(errorData));
    }
}
注意事項
flatbuffers格式只適用於sql查詢方式,使用時需定義format:flatbuffers;如不需要flatbuffers格式,可使用普通json,定義format:json即可。
flatbuffers轉換必須使用
aliyun-sdk-ha3engine 1.3.2
版本