DashVector數據類型定義
更新時間:
本文介紹向量檢索服務DashVector的數據類型定義。
Doc
@dataclass(frozen=True)
class Doc(object):
    """Immutable record describing a single document."""

    id: str  # primary key
    vector: Union[List[int], List[float], np.ndarray]  # vector data
    vectors: Optional[Dict[str, VectorValueType]] = None  # multi-vector data
    sparse_vector: Optional[Dict[int, float]] = None  # sparse vector data
    fields: Optional[FieldDataType] = None  # user-defined fields of the Doc
    score: float = 0.0  # vector similarity score
/** A single document: primary key, vector payload(s), custom fields and similarity score. */
@Data
@Builder
public class Doc {
  /** Primary key. */
  @NonNull private String id;

  /** Vector data. */
  @NonNull private Vector vector;

  /** Sparse vector data. */
  private TreeMap<Integer, Float> sparseVector;

  /** User-defined document fields; starts out as an empty map when not set on the builder. */
  @Builder.Default private Map<String, Object> fields = new HashMap<>();

  /** Vector similarity score. */
  private float score;

  /** Multi-vector data. */
  @Singular
  private Map<String, Vector> vectors;

  /** Stores a string-valued custom field under {@code key}. */
  public void addField(String key, String value) {
    fields.put(key, value);
  }

  /** Stores an integer-valued custom field under {@code key}. */
  public void addField(String key, Integer value) {
    fields.put(key, value);
  }

  /** Stores a float-valued custom field under {@code key}. */
  public void addField(String key, Float value) {
    fields.put(key, value);
  }

  /** Stores a boolean-valued custom field under {@code key}. */
  public void addField(String key, Boolean value) {
    fields.put(key, value);
  }
}
DocOpResult
@dataclass(frozen=True)
class DocOpResult(object):
    """Immutable record describing the outcome of one document operation."""

    doc_op: DocOp
    id: str
    code: int
    message: str
/** Outcome of one document operation; can be copied from its protobuf counterpart. */
@Getter
@Builder
@AllArgsConstructor
@NoArgsConstructor
public class DocOpResult implements Serializable {
  /** Operation kind; serialized under the JSON name {@code doc_op}. */
  @JsonProperty("doc_op")
  private com.aliyun.dashvector.proto.DocOpResult.DocOp docOp;

  private String id;

  private int code;

  private String message;

  /**
   * Copies the operation, id, code and message out of a protobuf result.
   *
   * @param docOpResult protobuf message to copy from
   */
  public DocOpResult(com.aliyun.dashvector.proto.DocOpResult docOpResult) {
    id = docOpResult.getId();
    docOp = docOpResult.getDocOp();
    code = docOpResult.getCode();
    message = docOpResult.getMessage();
  }
}
CollectionMeta
@dataclass(frozen=True)
class CollectionMeta(object):
    """Immutable record holding the metadata of a Collection."""

    name: str  # Collection name
    dimension: int  # vector dimension
    dtype: str  # vector data type, FLOAT/INT
    metric: str  # distance metric, euclidean/dotproduct/cosine
    status: Status  # Collection status
    fields: Dict[str, str]  # Collection field definitions; allowed dict values: FLOAT/BOOL/INT/STRING
    partitions: Dict[str, Status]  # Collection partition information
/** Read-only view of a Collection's metadata, populated from a {@code CollectionInfo}. */
@Getter
public class CollectionMeta {
  /** Collection name. */
  private final String name;

  /** Vector dimension. */
  private final int dimension;

  /** Vector data type, FLOAT/INT. */
  private final CollectionInfo.DataType dataType;

  /** Distance metric, euclidean/dotproduct/cosine. */
  private final CollectionInfo.Metric metric;

  /** Collection status, stored as the status enum constant's name. */
  private final String status;

  /** Collection field definitions; allowed values: FLOAT/BOOL/INT/STRING. */
  private final Map<String, FieldType> fieldsSchema;

  /** Collection partition information. */
  private final Map<String, Status> partitionStatus;

  /**
   * Copies every metadata attribute out of the given collection info.
   *
   * @param collectionInfo source of the metadata values
   */
  public CollectionMeta(CollectionInfo collectionInfo) {
    name = collectionInfo.getName();
    dimension = collectionInfo.getDimension();
    dataType = collectionInfo.getDtype();
    metric = collectionInfo.getMetric();
    status = collectionInfo.getStatus().name();
    fieldsSchema = collectionInfo.getFieldsSchemaMap();
    partitionStatus = collectionInfo.getPartitionsMap();
  }
}
CollectionStats
@dataclass(frozen=True)
class CollectionStats(object):
    """Immutable record holding the statistics of a Collection."""

    total_doc_count: int  # total amount of data inserted into the Collection
    index_completeness: float  # completeness of the data inserted into the Collection
    partitions: Dict[str, PartitionStats]  # Collection partition information
/** Statistics of a Collection, converted from the protobuf stats message. */
@Getter
public class CollectionStats {
  /** Total amount of data inserted into the Collection. */
  private final long totalDocCount;

  /** Completeness of the data inserted into the Collection. */
  private final float indexCompleteness;

  /** Collection partition information. */
  private final Map<String, PartitionStats> partitions;

  /**
   * Copies the counters and wraps each partition's protobuf stats in a {@link PartitionStats}.
   *
   * @param collectionStats protobuf stats message to convert
   */
  public CollectionStats(StatsCollectionResponse.CollectionStats collectionStats) {
    totalDocCount = collectionStats.getTotalDocCount();
    indexCompleteness = collectionStats.getIndexCompleteness();

    // Fill a local map first, then publish it through the final field.
    Map<String, PartitionStats> converted = new HashMap<>();
    collectionStats
        .getPartitionsMap()
        .forEach((partitionName, protoStats) ->
            converted.put(partitionName, new PartitionStats(protoStats)));
    partitions = converted;
  }
}
PartitionStats
@dataclass(frozen=True)
class PartitionStats(object):
    """Immutable record holding the statistics of a Partition."""

    total_doc_count: int  # total amount of data inside the Partition
/** Statistics of a single Partition, converted from the protobuf stats message. */
@Getter
public class PartitionStats {
  /** Total amount of data inside the Partition. */
  private final long totalDocCount;

  /**
   * Copies the document count out of the protobuf message.
   *
   * @param partitionStats protobuf stats message to convert
   */
  public PartitionStats(com.aliyun.dashvector.proto.PartitionStats partitionStats) {
    totalDocCount = partitionStats.getTotalDocCount();
  }
}
Status
class Status(IntEnum):
    """Lifecycle state of a Collection or Partition."""

    INITIALIZED = 0  # Collection/Partition is being created
    SERVING = 1  # Collection/Partition is serving
    DROPPING = 2  # Collection/Partition is being deleted
    ERROR = 3  # Collection/Partition is in an abnormal state
Group
@dataclass(frozen=True)
class Group(object):
    """Immutable record holding one group of documents."""

    group_id: str  # group identifier
    docs: List[Doc]  # documents belonging to the group
/** One group of documents. */
@Getter
@Builder
public class Group {
  /** Group identifier. */
  @NonNull private String groupId;

  /** Documents belonging to the group. */
  @Singular private List<Doc> docs;
}
RequestUsage
# read_units and write_units are in a oneof relationship
class RequestUsage(object):
    """Request units attributed to a request."""

    read_units: int  # number of read request units
    write_units: int  # number of write request units
/** Request units attributed to a request; default-valued properties are left out of the JSON. */
@Data
@Builder
@JsonInclude(JsonInclude.Include.NON_DEFAULT)
public class RequestUsage {
  /** Number of read request units. */
  private int readUnits;

  /** Number of write request units. */
  private int writeUnits;
}
VectorParam
class VectorParam(object):
    """Parameters describing a vector field."""

    dimension: int  # vector dimension
    # The original lacked the comma between the Union members (a SyntaxError) and
    # ended with a stray comma that would have turned the default into a tuple.
    dtype: Union[Type[int], Type[float]] = float  # data type
    metric: str = "cosine"  # distance metric
    quantize_type: str = ""  # quantization type, see http://m.bestwisewords.com/document_detail/2663745.html
/** Parameters describing a vector field. */
@Builder
@Getter
public class VectorParam {
  /** Vector dimension. */
  private int dimension;

  /** Data type; FLOAT unless overridden on the builder. */
  @Builder.Default
  @NonNull
  private CollectionInfo.DataType dataType = CollectionInfo.DataType.FLOAT;

  /** Distance metric; cosine unless overridden on the builder. */
  @Builder.Default
  @NonNull
  private CollectionInfo.Metric metric = CollectionInfo.Metric.cosine;

  /** Quantization type, see http://m.bestwisewords.com/document_detail/2663745.html */
  @Builder.Default
  private String quantizeType = "";
}
VectorQuery
class VectorQuery(object):
    """Per-vector query parameters."""

    vector: VectorValueType  # vector data
    num_candidates: int = 0  # number of candidates; defaults to the topk of the query parameters
    is_linear: bool = False  # whether to perform a linear (brute-force) search
    ef: int = 0  # ef used during HNSW search
    radius: float = 0.0  # radius used for RNN search
@Builder
@Getter
public class VectorQuery {
/** 向量數(shù)據(jù) */
private Vector vector;
/** 候選集個(gè)數(shù),默認(rèn)為query參數(shù)中的topk */
private int numCandidates = 0;
/** 是否做線性(暴力)檢索 */
private boolean linear = false;
/** HNSW檢索時(shí)的ef */
private int ef = 0;
/** RNN檢索的半徑 */
private float radius = 0.0F;
}
BaseRanker
融合排序的基類。
class BaseRanker:
    """Base class for fusion rankers."""
/** Base type for fusion rankers. */
public interface Ranker {
  /** Returns this ranker as a {@code com.aliyun.dashvector.proto.Ranker}. */
  com.aliyun.dashvector.proto.Ranker toProto();
}
RrfRanker
倒數秩融合排序 (Reciprocal Rank Fusion)根據文檔的排序來計算分數貢獻值,其計算公式如下:
$$\mathrm{score}(doc) = \sum_{i} \frac{1}{\mathrm{rank\_constant} + \mathrm{rank}_i(doc)}$$
其中 $\mathrm{rank\_constant}$ 為常數值,默認為60;$\mathrm{rank}_i(doc)$ 為多向量檢索時該文檔在第i條向量召回結果中的排名,其中排名從1開始。
使用RrfRanker時只有排序起作用,原分數不起作用,文檔的最終得分為單個檢索排名得到的分數貢獻值之和。
class RrfRanker(BaseRanker):
    """Reciprocal Rank Fusion ranker."""

    def __init__(self, rank_constant: int = 60):
        """Store the rank constant (60 by default)."""
        self.rank_constant = rank_constant
/**
 * Reciprocal Rank Fusion ranker.
 *
 * <p>NOTE(review): the {@code toProto()} implementation required by {@link Ranker} is not shown
 * in this snippet - confirm against the full class.
 */
@Builder
@Getter
public class RrfRanker implements Ranker {
  /** Rank constant; 60 unless overridden on the builder. */
  @Builder.Default
  private int rankConstant = 60;
}
WeightedRanker
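加權融合排序的計算公式如下:
$$\mathrm{score}(doc) = \sum_{i} w_i \cdot \mathrm{norm\_score}_i(doc)$$
其中 $w_i$ 為第i條向量的權重,$\mathrm{norm\_score}_i(doc)$ 為該文檔在第i條向量檢索中歸一化後的分數。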
加權融合排序對於單次檢索的分數賦予一個權重。而由於不同距離計算得到的分數的範圍差別很大,為了降低用戶設置權重的難度,我們在加權之前會對分數做歸一化,歸一化後範圍為[0, 1],其中1表示距離最近,0表示距離最遠。
使用WeightedRanker時只有原分數起作用,排序不起作用,文檔的最終得分為權重與歸一化分數的乘積之和。
權重保持默認值時各個向量的權重均為1.0;否則需要設置每個向量的權重,並且和檢索時的向量必須完全匹配。
class WeightedRanker(BaseRanker):
    """Weighted fusion ranker."""

    def __init__(self, weights: Optional[Dict[str, float]] = None):
        """Store the weights; None means every vector has the same weight."""
        self.weights = weights  # weight values; None means every vector has the same weight
/**
 * Weighted fusion ranker.
 *
 * <p>NOTE(review): the {@code toProto()} implementation required by {@link Ranker} is not shown
 * in this snippet - confirm against the full class.
 */
@Builder
@Getter
public class WeightedRanker implements Ranker {
  /** Weight values; {@code null} means every vector has the same weight. */
  private Map<String, Float> weights;
}
其他
# Accepted representations of one vector.
VectorValueType = Union[List[int], List[float], np.ndarray]
# Custom field data maps a field name to a str/int/float/bool VALUE. The original
# alias used Type[...] members, which describe type objects rather than values.
FieldDataType = Dict[str, Union[str, int, float, bool]]
文檔內容是否對您有幫助?