chore: metric name and group by extractor with CH and PromQL support (#9543)

This commit is contained in:
Abhishek Kumar Singh
2025-11-20 22:58:16 +05:30
committed by GitHub
parent 096e38ee91
commit 09cbe4aa0d
9 changed files with 2889 additions and 3 deletions

2
go.mod
View File

@@ -4,7 +4,7 @@ go 1.24.0
require (
dario.cat/mergo v1.0.1
github.com/AfterShip/clickhouse-sql-parser v0.4.11
github.com/AfterShip/clickhouse-sql-parser v0.4.16
github.com/ClickHouse/clickhouse-go/v2 v2.40.1
github.com/DATA-DOG/go-sqlmock v1.5.2
github.com/SigNoz/govaluate v0.0.0-20240203125216-988004ccc7fd

4
go.sum
View File

@@ -66,8 +66,8 @@ dario.cat/mergo v1.0.1/go.mod h1:uNxQE+84aUszobStD9th8a29P2fMDhsBdgRYvZOxGmk=
dmitri.shuralyov.com/gpu/mtl v0.0.0-20190408044501-666a987793e9/go.mod h1:H6x//7gZCb22OMCxBHrMx7a5I7Hp++hsVxbQ4BYO7hU=
filippo.io/edwards25519 v1.1.0 h1:FNf4tywRC1HmFuKW5xopWpigGjJKiJSV0Cqo0cJWDaA=
filippo.io/edwards25519 v1.1.0/go.mod h1:BxyFTGdWcka3PhytdK4V28tE5sGfRvvvRV7EaN4VDT4=
github.com/AfterShip/clickhouse-sql-parser v0.4.11 h1:fZMKAjRmgzW44+hEhF6ywi4VjFZQjJ8QrFBbgBsjmF4=
github.com/AfterShip/clickhouse-sql-parser v0.4.11/go.mod h1:W0Z82wJWkJxz2RVun/RMwxue3g7ut47Xxl+SFqdJGus=
github.com/AfterShip/clickhouse-sql-parser v0.4.16 h1:gpl+wXclYUKT0p4+gBq22XeRYWwEoZ9f35vogqMvkLQ=
github.com/AfterShip/clickhouse-sql-parser v0.4.16/go.mod h1:W0Z82wJWkJxz2RVun/RMwxue3g7ut47Xxl+SFqdJGus=
github.com/Azure/azure-sdk-for-go v68.0.0+incompatible h1:fcYLmCpyNYRnvJbPerq7U0hS+6+I79yEDJBqVNcqUzU=
github.com/Azure/azure-sdk-for-go/sdk/azcore v1.18.0 h1:Gt0j3wceWMwPmiazCa8MzMA0MfhmPIz0Qp0FJ6qcM0U=
github.com/Azure/azure-sdk-for-go/sdk/azcore v1.18.0/go.mod h1:Ot/6aikWnKWi4l9QB7qVSwa8iMphQNqkWALMoNT3rzM=

View File

@@ -0,0 +1,687 @@
package queryfilterextractor
import (
"fmt"
"strings"
clickhouse "github.com/AfterShip/clickhouse-sql-parser/parser"
"github.com/SigNoz/signoz/pkg/errors"
)
const (
	// MetricNameColumn is the ClickHouse column holding the metric name;
	// comparisons against this column are what the extractor treats as
	// metric-name filters.
	MetricNameColumn = "metric_name"
)
// ClickHouseFilterExtractor extracts metric names and grouping keys from
// ClickHouse SQL queries. It holds no state, so the zero value is usable.
type ClickHouseFilterExtractor struct{}
// NewClickHouseFilterExtractor returns a ready-to-use ClickHouse filter extractor.
func NewClickHouseFilterExtractor() *ClickHouseFilterExtractor {
	extractor := &ClickHouseFilterExtractor{}
	return extractor
}
// Extract parses a ClickHouse query and extracts metric names and grouping keys.
// It returns an invalid-input error when the query cannot be parsed.
func (e *ClickHouseFilterExtractor) Extract(query string) (*FilterResult, error) {
	stmts, err := clickhouse.NewParser(query).ParseStmts()
	if err != nil {
		return nil, errors.NewInvalidInputf(errors.CodeInvalidInput, "failed to parse clickhouse query: %s", err.Error())
	}

	// Pass 1: walk each top-level SELECT to collect metric names, remembering
	// those queries for the GROUP BY pass that follows.
	metricSet := make(map[string]bool)
	topLevel := make(map[*clickhouse.SelectQuery]bool)
	for _, stmt := range stmts {
		sq, isSelect := stmt.(*clickhouse.SelectQuery)
		if !isSelect {
			continue
		}
		topLevel[sq] = true
		clickhouse.Walk(sq, func(node clickhouse.Expr) bool {
			e.fillMetricNamesFromExpr(node, metricSet)
			return true // keep traversing
		})
	}

	// Pass 2: build a CTE name -> query map so GROUP BY extraction can follow
	// references through CTEs and subqueries.
	ctes := make(map[string]*clickhouse.SelectQuery)
	for sq := range topLevel {
		e.buildCTEMap(sq, ctes)
	}

	// Resolve GROUP BY columns (with aliases and origins) recursively; when
	// the same column name appears across queries, the last ColumnInfo wins.
	colByName := make(map[string]ColumnInfo)
	seen := make(map[*clickhouse.SelectQuery]bool)
	for sq := range topLevel {
		cols, err := e.extractGroupByColumns(sq, ctes, seen)
		if err != nil {
			return nil, err
		}
		for _, c := range cols {
			colByName[c.Name] = c
		}
	}

	// Materialize both sets into the result slices.
	out := &FilterResult{MetricNames: []string{}, GroupByColumns: []ColumnInfo{}}
	for name := range metricSet {
		out.MetricNames = append(out.MetricNames, name)
	}
	for _, c := range colByName {
		out.GroupByColumns = append(out.GroupByColumns, c)
	}
	return out, nil
}
// ========================================
// Metric Name Extraction
// ========================================

// fillMetricNamesFromExpr inspects one AST node and records any metric names
// it contributes; only binary operations can carry a metric_name filter.
func (e *ClickHouseFilterExtractor) fillMetricNamesFromExpr(node clickhouse.Expr, metricNames map[string]bool) {
	if binOp, ok := node.(*clickhouse.BinaryOperation); ok {
		e.fillMetricFromBinaryOp(binOp, metricNames)
	}
}
// fillMetricFromBinaryOp extracts metrics from a binary operation, handling
// the metric_name column on either side of the comparison.
func (e *ClickHouseFilterExtractor) fillMetricFromBinaryOp(op *clickhouse.BinaryOperation, metricNames map[string]bool) {
	// metric_name = 'value'
	if e.getColumnName(op.LeftExpr) == MetricNameColumn {
		e.fillMetricWithBinaryOpConditions(op, op.RightExpr, metricNames)
		return
	}
	// 'value' = metric_name
	if e.getColumnName(op.RightExpr) == MetricNameColumn {
		e.fillMetricWithBinaryOpConditions(op, op.LeftExpr, metricNames)
	}
}
// fillMetricWithBinaryOpConditions records metric names from the value side of
// a binary operation whose other side is the metric_name column.
//
// Supported operators:
//   - "=", "==": literal string values, or values wrapped in any()
//   - "IN", "GLOBAL IN": every literal string value in the list
//
// Unsupported operators (can be added later if needed):
//   - "!=", "<>", "NOT IN": negative filters (e.g., metric_name != 'a')
//   - "LIKE", "ILIKE", "NOT LIKE", "NOT ILIKE": pattern-matching filters
//   - "OR", "AND": not handled here because Walk traverses both sides of
//     boolean operations and extracts metrics from each branch on its own
//     (e.g., metric_name='a' OR metric_name='b')
func (e *ClickHouseFilterExtractor) fillMetricWithBinaryOpConditions(op *clickhouse.BinaryOperation, valueExpr clickhouse.Expr, metricNames map[string]bool) {
	fn, isFunc := valueExpr.(*clickhouse.FunctionExpr)
	switch op.Operation {
	case clickhouse.TokenKindSingleEQ, clickhouse.TokenKindDoubleEQ:
		// metric_name = 'value' or metric_name = any(['a', 'b'])
		if isFunc {
			// Function-wrapped literals are ignored (test case: CH59),
			// except any(), whose members we still extract.
			if fn.Name != nil && fn.Name.Name == "any" {
				e.extractInValues(valueExpr, metricNames)
			}
			return
		}
		if lit := e.extractStringLiteral(valueExpr); lit != "" {
			metricNames[lit] = true
		}
	case "IN", "GLOBAL IN":
		// metric_name IN ('a', 'b', 'c'); GLOBAL IN behaves the same for
		// metric extraction. Function-wrapped literals are ignored (CH59).
		if !isFunc {
			e.extractInValues(valueExpr, metricNames)
		}
	}
}
// extractStringLiteral returns the value of a string-literal expression, or ""
// for any other expression type.
func (e *ClickHouseFilterExtractor) extractStringLiteral(expr clickhouse.Expr) string {
	if lit, ok := expr.(*clickhouse.StringLiteral); ok {
		return lit.Literal
	}
	return ""
}
// extractInValues collects every string literal found anywhere inside expr
// (e.g. the members of an IN list) into metricNames.
func (e *ClickHouseFilterExtractor) extractInValues(expr clickhouse.Expr, metricNames map[string]bool) {
	// Metric names passed in an IN condition appear as string literals.
	literals := clickhouse.FindAll(expr, func(n clickhouse.Expr) bool {
		_, isLit := n.(*clickhouse.StringLiteral)
		return isLit
	})
	for _, node := range literals {
		lit, ok := node.(*clickhouse.StringLiteral)
		if !ok {
			continue
		}
		if val := e.extractStringLiteral(lit); val != "" {
			metricNames[val] = true
		}
	}
}
// ========================================
// GROUP BY Column Extraction
// ========================================

// extractGroupByColumns extracts the GROUP BY columns from a query.
// It follows a top-down approach: an outer GROUP BY overrides any GROUP BY
// found deeper inside subqueries and CTEs.
// Returns a slice of ColumnInfo carrying each column's name, its SELECT alias
// (if any), and its traced origin expression/field.
//
// visited guards against revisiting the same SelectQuery (cyclic CTE
// references); an already-visited query yields (nil, nil).
func (e *ClickHouseFilterExtractor) extractGroupByColumns(query *clickhouse.SelectQuery, cteMap map[string]*clickhouse.SelectQuery, visited map[*clickhouse.SelectQuery]bool) ([]ColumnInfo, error) {
	if visited[query] {
		return nil, nil
	}
	// Mark this query as visited to prevent cycles
	visited[query] = true
	// First, check if this query has its own GROUP BY using direct field access
	hasGroupBy := query.GroupBy != nil
	// If this query has GROUP BY, use it (outer overrides inner)
	if hasGroupBy {
		// Collect the GROUP BY column names into a set
		tempGroupBy := make(map[string]bool)
		e.fillGroupsFromGroupByClause(query.GroupBy, tempGroupBy)
		// Extract SELECT columns and their aliases from the same query level
		selectAliases := e.extractSelectColumns(query)
		// Build ColumnInfo array by matching GROUP BY with SELECT aliases and origins
		result := []ColumnInfo{}
		for groupByCol := range tempGroupBy {
			alias := selectAliases[groupByCol] // Will be "" if not in SELECT
			// Trace the column back through subqueries/CTEs to its origin
			// expression. A fresh visited map is used per column so one
			// column's trace does not cut another column's trace short.
			originVisited := make(map[*clickhouse.SelectQuery]bool)
			originExpr := e.extractColumnOrigin(groupByCol, query, cteMap, originVisited)
			// Re-parse the origin expression as a tiny SELECT to derive the
			// underlying origin field (plain column or JSON-extracted key).
			originField, err := extractCHOriginFieldFromQuery(fmt.Sprintf("SELECT %s", originExpr))
			if err != nil {
				return nil, err
			}
			result = append(result, ColumnInfo{
				Name:        groupByCol,
				Alias:       alias,
				OriginExpr:  originExpr,
				OriginField: originField,
			})
		}
		return result, nil
	}
	// No GROUP BY at this level: the grouping may live inside the CTE or
	// subquery this query selects from, so follow the source recursively.
	sourceQuery := e.extractSourceQuery(query, cteMap)
	if sourceQuery != nil {
		return e.extractGroupByColumns(sourceQuery, cteMap, visited)
	}
	return nil, nil
}
// fillGroupsFromGroupByClause records the column names listed in a GROUP BY
// clause into the groupBy set.
func (e *ClickHouseFilterExtractor) fillGroupsFromGroupByClause(groupByClause *clickhouse.GroupByClause, groupBy map[string]bool) {
	// Find (not FindAll) is used so only the first, direct-child
	// ColumnExprList is taken, never a nested one.
	node, found := clickhouse.Find(groupByClause, func(n clickhouse.Expr) bool {
		_, isList := n.(*clickhouse.ColumnExprList)
		return isList
	})
	if !found {
		return
	}
	exprList, ok := node.(*clickhouse.ColumnExprList)
	if !ok || exprList.Items == nil {
		return
	}
	// Only top-level Items are read: for 'toDate(timestamp)' we record the
	// whole call and never the nested 'timestamp'.
	for _, item := range exprList.Items {
		key := e.extractColumnStrByExpr(item)
		if key == "" {
			continue
		}
		// Drop any table-alias prefix (e.g. "m.region" -> "region").
		groupBy[e.stripTableAlias(key)] = true
	}
}
// extractColumnStrByExpr renders an expression as the string used for GROUP BY
// matching.
//
// Supports:
//   - Ident: simple identifier like "region" or "timestamp" (backticks are
//     restored, since the parser strips them)
//   - FunctionExpr: the complete call text like "toDate(timestamp)"
//   - ColumnExpr: unwraps to the underlying expression
//   - anything else: the expression's String() form
//
// Examples:
//   - "region" -> "region"
//   - "toDate(timestamp)" -> "toDate(timestamp)"
//   - "`m.region`" -> "`m.region`"
func (e *ClickHouseFilterExtractor) extractColumnStrByExpr(expr clickhouse.Expr) string {
	if expr == nil {
		return ""
	}
	switch ex := expr.(type) {
	case *clickhouse.Ident:
		// Backticks are ClickHouse's quoting for literal names; the parser
		// removes them, so add them back to preserve the original form.
		if ex.QuoteType == clickhouse.BackTicks {
			return "`" + ex.Name + "`"
		}
		return ex.Name
	case *clickhouse.FunctionExpr:
		// Return the complete function-call string.
		return ex.String()
	case *clickhouse.ColumnExpr:
		// ColumnExpr wraps another expression; recurse when one is present.
		if ex.Expr != nil {
			return e.extractColumnStrByExpr(ex.Expr)
		}
		return ex.String()
	}
	// Fallback: the expression's own string representation.
	return expr.String()
}
// stripTableAlias removes a table-alias prefix from a column name
// (e.g. "m.region" -> "region"). Backtick-quoted names are ClickHouse literal
// identifiers, so their inner dots are preserved (`os.type` -> "os.type").
func (e *ClickHouseFilterExtractor) stripTableAlias(name string) string {
	// Backticked literal name: strip the quotes but keep the whole string.
	if strings.HasPrefix(name, "`") && strings.HasSuffix(name, "`") {
		return strings.Trim(name, "`")
	}
	// Otherwise keep only the part after the last dot, if any.
	if idx := strings.LastIndex(name, "."); idx >= 0 {
		return name[idx+1:]
	}
	return name
}
// getColumnName extracts a bare column name from an expression, handling both
// simple identifiers and qualified paths like "m.metric_name".
func (e *ClickHouseFilterExtractor) getColumnName(expr clickhouse.Expr) string {
	switch ex := expr.(type) {
	case *clickhouse.Ident:
		return ex.Name
	case *clickhouse.Path:
		// For a qualified name, the final field is the column itself.
		if n := len(ex.Fields); n > 0 {
			return ex.Fields[n-1].Name
		}
	}
	return ""
}
// extractSourceQuery resolves the query feeding the FROM clause of the given
// query: either a CTE referenced by name, or an inline subquery.
//
// For example, in:
//
//	WITH aggregated AS (
//	    SELECT region as region_alias, sum(value) AS total
//	    FROM metrics
//	    WHERE metric_name = 'cpu_usage'
//	    GROUP BY region
//	)
//	SELECT * FROM aggregated
//
// the FROM clause names "aggregated", so that CTE's SelectQuery is returned
// from cteMap and becomes the source for GROUP BY extraction.
func (e *ClickHouseFilterExtractor) extractSourceQuery(query *clickhouse.SelectQuery, cteMap map[string]*clickhouse.SelectQuery) *clickhouse.SelectQuery {
	if query.From == nil {
		return nil
	}
	// Collect every identifier and nested SELECT inside the FROM clause.
	candidates := clickhouse.FindAll(query.From, func(node clickhouse.Expr) bool {
		switch node.(type) {
		case *clickhouse.Ident, *clickhouse.SelectQuery:
			return true
		default:
			return false
		}
	})
	for _, candidate := range candidates {
		switch src := candidate.(type) {
		case *clickhouse.Ident:
			// A bare name may refer to a CTE.
			if cte, ok := cteMap[src.Name]; ok {
				return cte
			}
		case *clickhouse.SelectQuery:
			// Inline subquery in FROM.
			return src
		}
	}
	return nil
}
// ========================================
// Column Origin Tracing
// ========================================

// extractColumnOrigin recursively traces a column back to its original
// expression through CTEs, subqueries, and SELECT aliases.
// Returns the original expression string
// (e.g., "JSONExtractString(labels, 'service.name')"), or the column name
// itself if it is a direct column reference or cannot be traced further.
func (e *ClickHouseFilterExtractor) extractColumnOrigin(
	columnName string,
	query *clickhouse.SelectQuery,
	cteMap map[string]*clickhouse.SelectQuery,
	visited map[*clickhouse.SelectQuery]bool,
) string {
	if query == nil {
		return columnName
	}
	// Prevent infinite recursion and redundant work:
	// once a query is visited, we don't need to check it again.
	if visited[query] {
		return columnName
	}
	visited[query] = true
	// The visited mark must only prevent recursion within the CURRENT search;
	// it must not suppress searches started for other columns/queries, so it
	// is removed once the search for this query completes.
	defer delete(visited, query)
	// Step 1: descend through CTEs and joined sources first, so the deepest
	// (most original) definition of the column wins when one exists.
	sourceQuery := e.extractSourceQuery(query, cteMap)
	if sourceQuery != nil {
		returningOrigin := e.extractColumnOrigin(columnName, sourceQuery, cteMap, visited)
		if returningOrigin != columnName {
			return returningOrigin
		}
	}
	// Step 2: no deeper source produced an origin, so inspect every
	// SelectItem of this query level.
	selectItems := clickhouse.FindAll(query, func(node clickhouse.Expr) bool {
		_, ok := node.(*clickhouse.SelectItem)
		return ok
	})
	// extractOriginFromSelectItem extracts the origin from one SelectItem.
	// It returns nil when the item does not match columnName at all.
	extractOriginFromSelectItem := func(selectItem *clickhouse.SelectItem) *string {
		// Check if this SelectItem matches our column (by alias or by name)
		alias := e.extractSelectItemAlias(selectItem)
		exprStr := e.extractSelectItemName(selectItem)
		normalizedExpr := e.stripTableAlias(exprStr)
		// Case 1: Column matches an alias in SELECT
		if alias == columnName {
			// This is an alias - get the expression it's aliasing
			if selectItem.Expr != nil {
				originExpr := e.extractFullExpression(selectItem.Expr)
				// If the expression is just a column name, trace it back further
				if normalizedExpr == columnName || e.isSimpleColumnReference(selectItem.Expr) {
					// It's referencing another column - trace back through source query
					sourceQuery := e.extractSourceQuery(query, cteMap)
					if sourceQuery != nil {
						originExpr := e.extractColumnOrigin(normalizedExpr, sourceQuery, cteMap, visited)
						return &originExpr
					}
				}
				return &originExpr
			}
		}
		// Case 2: Column matches the expression itself (no alias)
		if normalizedExpr == columnName {
			// Check if this is a simple column reference or a complex expression
			if e.isSimpleColumnReference(selectItem.Expr) {
				// Simple column - trace back through source query
				sourceQuery := e.extractSourceQuery(query, cteMap)
				if sourceQuery != nil {
					originExpr := e.extractColumnOrigin(columnName, sourceQuery, cteMap, visited)
					return &originExpr
				}
				return &columnName
			} else {
				// Complex expression - return it as origin
				originExpr := e.extractFullExpression(selectItem.Expr)
				return &originExpr
			}
		}
		return nil
	}
	var finalColumnOrigin string
	for _, itemNode := range selectItems {
		if selectItem, ok := itemNode.(*clickhouse.SelectItem); ok {
			// Keep the LAST non-nil origin across all SelectItems; this
			// reaches the most nested origin of the column where a matching
			// SelectItem is present.
			origin := extractOriginFromSelectItem(selectItem)
			if origin != nil {
				finalColumnOrigin = *origin
			}
		}
	}
	if finalColumnOrigin != "" {
		return finalColumnOrigin
	}
	return columnName
}
// extractFullExpression returns the complete string form of an expression,
// or "" for nil.
func (e *ClickHouseFilterExtractor) extractFullExpression(expr clickhouse.Expr) string {
	if expr != nil {
		return expr.String()
	}
	return ""
}
// isSimpleColumnReference reports whether expr is a plain column reference
// rather than a function call or other complex expression.
func (e *ClickHouseFilterExtractor) isSimpleColumnReference(expr clickhouse.Expr) bool {
	switch ex := expr.(type) {
	case nil:
		return false
	case *clickhouse.Ident:
		// Backtick-quoted identifiers are deliberately treated as non-simple
		// so the origin expression keeps its backticks; the origin parser
		// later handles the backticks and extracts the column name.
		return ex.QuoteType != clickhouse.BackTicks
	case *clickhouse.Path:
		return true
	case *clickhouse.ColumnExpr:
		// Unwrap and test the inner expression when present.
		if ex.Expr != nil {
			return e.isSimpleColumnReference(ex.Expr)
		}
	}
	return false
}
// ========================================
// SELECT Column Alias Extraction
// ========================================

// extractSelectColumns maps normalized SELECT column names to their aliases
// for a specific query. For duplicate columns with different aliases, the
// last alias wins. Like fillGroupsFromGroupByClause, it works over the
// query's SelectItem nodes only.
func (e *ClickHouseFilterExtractor) extractSelectColumns(query *clickhouse.SelectQuery) map[string]string {
	aliases := map[string]string{}
	if query == nil {
		return aliases
	}
	// SelectItem nodes carry both the column expression (Expr) and an
	// optional alias (Alias).
	items := clickhouse.FindAll(query, func(node clickhouse.Expr) bool {
		_, isItem := node.(*clickhouse.SelectItem)
		return isItem
	})
	for _, node := range items {
		item, ok := node.(*clickhouse.SelectItem)
		if !ok {
			continue
		}
		name := e.extractSelectItemName(item)
		if name == "" {
			continue
		}
		// Normalize by stripping any table alias; last alias wins on dupes.
		aliases[e.stripTableAlias(name)] = e.extractSelectItemAlias(item)
	}
	return aliases
}
// extractSelectItemName returns the column name or expression text of a
// SelectItem, or "" when the item or its expression is absent.
func (e *ClickHouseFilterExtractor) extractSelectItemName(selectItem *clickhouse.SelectItem) string {
	if selectItem != nil && selectItem.Expr != nil {
		return e.extractColumnStrByExpr(selectItem.Expr)
	}
	return ""
}
// extractSelectItemAlias returns the alias of a SelectItem, or "" when no
// alias is present.
func (e *ClickHouseFilterExtractor) extractSelectItemAlias(selectItem *clickhouse.SelectItem) string {
	if selectItem == nil || selectItem.Alias == nil {
		return ""
	}
	// Alias is an *Ident; its Name is already "" when no alias was written.
	return selectItem.Alias.Name
}
// ========================================
// CTE and Subquery Extraction
// ========================================

// buildCTEMap records every CTE name -> SelectQuery pair reachable from query,
// recursing into nested CTEs and into subqueries/other expressions.
func (e *ClickHouseFilterExtractor) buildCTEMap(query *clickhouse.SelectQuery, cteMap map[string]*clickhouse.SelectQuery) {
	// CTEs declared directly on this query's WITH clause.
	if query.With != nil && query.With.CTEs != nil {
		for _, cte := range query.With.CTEs {
			name := e.extractCTEName(cte)
			body := e.extractCTEQuery(cte)
			if name == "" || body == nil {
				continue
			}
			cteMap[name] = body
			// A CTE body may itself declare further CTEs.
			e.buildCTEMap(body, cteMap)
		}
	}
	// CTEs can also appear inside subqueries and other expressions.
	e.buildCTEMapFromExpr(query, cteMap)
}
// extractCTEName returns the name of a CTE; the CTEStmt's Expr field holds it.
func (e *ClickHouseFilterExtractor) extractCTEName(cte *clickhouse.CTEStmt) string {
	if cte == nil || cte.Expr == nil {
		return ""
	}
	if ident, ok := cte.Expr.(*clickhouse.Ident); ok {
		return ident.Name
	}
	// Non-identifier names fall back to the expression's string form.
	return cte.Expr.String()
}
// extractCTEQuery returns the CTE's body; the CTEStmt's Alias field holds the
// SelectQuery, or nil when it is absent or of an unexpected type.
func (e *ClickHouseFilterExtractor) extractCTEQuery(cte *clickhouse.CTEStmt) *clickhouse.SelectQuery {
	if cte == nil || cte.Alias == nil {
		return nil
	}
	sq, ok := cte.Alias.(*clickhouse.SelectQuery)
	if !ok {
		return nil
	}
	return sq
}
// buildCTEMapFromExpr walks expr and collects CTEs declared inside any nested
// SelectQuery, table expression, or join.
func (e *ClickHouseFilterExtractor) buildCTEMapFromExpr(expr clickhouse.Expr, cteMap map[string]*clickhouse.SelectQuery) {
	clickhouse.Walk(expr, func(node clickhouse.Expr) bool {
		switch n := node.(type) {
		case *clickhouse.TableExpr:
			if n.Expr != nil {
				e.buildCTEMapFromExpr(n.Expr, cteMap)
			}
		case *clickhouse.JoinTableExpr:
			if n.Table != nil {
				e.buildCTEMapFromExpr(n.Table, cteMap)
			}
		case *clickhouse.SelectQuery:
			// Skip the node we started from to avoid infinite recursion.
			if n != expr {
				e.buildCTEMap(n, cteMap)
			}
		}
		return true // continue traversal
	})
}

View File

@@ -0,0 +1,305 @@
package queryfilterextractor
import (
"strings"
"github.com/AfterShip/clickhouse-sql-parser/parser"
"github.com/SigNoz/signoz/pkg/errors"
)
// excludedFunctions contains functions whose presence makes an expression
// yield no origin field: extractOriginFieldFromExpr returns "" when any of
// them appears, since time bucketing and aggregation hide the source column.
// Map key is the function name in lowercase, value is the original function name.
var excludedFunctions = map[string]string{
	// Time functions
	"now":                     "now",
	"today":                   "today",
	"yesterday":               "yesterday",
	"todatetime":              "toDateTime",
	"todatetime64":            "toDateTime64",
	"todate":                  "toDate",
	"todate32":                "toDate32",
	"tostartofinterval":       "toStartOfInterval",
	"tostartofday":            "toStartOfDay",
	"tostartofweek":           "toStartOfWeek",
	"tostartofmonth":          "toStartOfMonth",
	"tostartofquarter":        "toStartOfQuarter",
	"tostartofyear":           "toStartOfYear",
	"tostartofhour":           "toStartOfHour",
	"tostartofminute":         "toStartOfMinute",
	"tostartofsecond":         "toStartOfSecond",
	"tostartoffiveminutes":    "toStartOfFiveMinutes",
	"tostartoftenminutes":     "toStartOfTenMinutes",
	"tostartoffifteenminutes": "toStartOfFifteenMinutes",
	"tointervalsecond":        "toIntervalSecond",
	"tointervalminute":        "toIntervalMinute",
	"tointervalhour":          "toIntervalHour",
	"tointervalday":           "toIntervalDay",
	"tointervalweek":          "toIntervalWeek",
	"tointervalmonth":         "toIntervalMonth",
	"tointervalquarter":       "toIntervalQuarter",
	"tointervalyear":          "toIntervalYear",
	"parsedatetime":           "parseDateTime",
	"parsedatetimebesteffort": "parseDateTimeBestEffort",
	// Aggregate functions
	"count":          "count",
	"sum":            "sum",
	"avg":            "avg",
	"min":            "min",
	"max":            "max",
	"any":            "any",
	"stddevpop":      "stddevPop",
	"stddevsamp":     "stddevSamp",
	"varpop":         "varPop",
	"varsamp":        "varSamp",
	"grouparray":     "groupArray",
	"groupuniqarray": "groupUniqArray",
	"quantile":       "quantile",
	"quantiles":      "quantiles",
	"quantileexact":  "quantileExact",
	"quantiletiming": "quantileTiming",
	"median":         "median",
	"uniq":           "uniq",
	"uniqexact":      "uniqExact",
	"uniqcombined":   "uniqCombined",
	"uniqhll12":      "uniqHLL12",
	"topk":           "topK",
	"first":          "first",
	"last":           "last",
}
// jsonExtractFunctions contains the ClickHouse functions that extract values
// from JSON columns; for these, the extracted key (second argument) is used
// as the origin field instead of the column itself.
// Map key is the function name in lowercase, value is the original function name.
var jsonExtractFunctions = map[string]string{
	"jsonextractstring":        "JSONExtractString",
	"jsonextractint":           "JSONExtractInt",
	"jsonextractuint":          "JSONExtractUInt",
	"jsonextractfloat":         "JSONExtractFloat",
	"jsonextractbool":          "JSONExtractBool",
	"jsonextract":              "JSONExtract",
	"jsonextractraw":           "JSONExtractRaw",
	"jsonextractarrayraw":      "JSONExtractArrayRaw",
	"jsonextractkeysandvalues": "JSONExtractKeysAndValues",
}
// isFunctionPresentInStore reports whether funcName exists (compared
// case-insensitively) as a key in the given function store map.
func isFunctionPresentInStore(funcName string, funcStore map[string]string) bool {
	_, ok := funcStore[strings.ToLower(funcName)]
	return ok
}
// isReservedSelectKeyword reports whether keyword is a reserved keyword for
// the SELECT statement. Only keywords that can appear unquoted inside a
// SELECT statement are included.
func isReservedSelectKeyword(keyword string) bool {
	switch strings.ToUpper(keyword) {
	case parser.KeywordSelect, parser.KeywordFrom:
		return true
	}
	return false
}
// extractCHOriginFieldFromQuery extracts the origin field (column name) from a
// "SELECT <expr>" query string, or the extracted key in case of JSON
// extraction functions. Returns "" with no error when no single origin field
// can be derived.
func extractCHOriginFieldFromQuery(query string) (string, error) {
	// Parse the query string
	p := parser.NewParser(query)
	stmts, err := p.ParseStmts()
	if err != nil {
		return "", errors.NewInternalf(errors.CodeInternal, "failed to parse origin field from query: %s", err.Error())
	}
	// Guard against an empty statement list before indexing; previously
	// stmts[0] would panic here.
	if len(stmts) == 0 {
		return "", errors.NewInternalf(errors.CodeInternal, "origin field query produced no statements")
	}
	// The first statement must be a SELECT; use a checked assertion instead
	// of panicking on an unexpected statement type.
	selectStmt, ok := stmts[0].(*parser.SelectQuery)
	if !ok {
		return "", errors.NewInternalf(errors.CodeInternal, "origin field query is not a SELECT statement")
	}
	// If query has multiple select items, return blank string as we don't expect multiple select items
	if len(selectStmt.SelectItems) > 1 {
		return "", nil
	}
	if len(selectStmt.SelectItems) == 0 {
		return "", errors.NewInternalf(errors.CodeInternal, "SELECT query has no select items")
	}
	// Extract origin field from the first (and only) select item's expression
	return extractOriginFieldFromExpr(selectStmt.SelectItems[0].Expr)
}
// extractOriginFieldFromExpr derives the origin field (column name) from a
// parsed select-item expression. It returns:
//   - an error when the expression contains a reserved keyword
//   - "" (no error) when the expression uses excluded functions, CASE,
//     nested JSON extraction, or does not reduce to exactly one column
//   - the single column name otherwise
func extractOriginFieldFromExpr(expr parser.Expr) (string, error) {
	var hasExcluded, hasReserved bool
	parser.Walk(expr, func(node parser.Expr) bool {
		switch n := node.(type) {
		case *parser.Ident:
			// Reserved keywords parse as valid identifiers — e.g. in
			// "SELECT FROM table" the parser treats FROM as a column — so
			// they must be rejected explicitly.
			if n.QuoteType == parser.Unquoted && isReservedSelectKeyword(n.Name) {
				hasReserved = true
				return false
			}
		case *parser.FunctionExpr:
			// Excluded functions (time bucketing, aggregates, ...) mean no
			// single origin column exists.
			if isFunctionPresentInStore(n.Name.Name, excludedFunctions) {
				hasExcluded = true
				return false
			}
			// JSON extraction is allowed unless one of its arguments nests
			// another JSON extraction call.
			if isFunctionPresentInStore(n.Name.Name, jsonExtractFunctions) &&
				n.Params != nil && n.Params.Items != nil {
				for _, arg := range n.Params.Items.Items {
					if containsJSONExtractFunction(arg) {
						hasExcluded = true
						return false
					}
				}
			}
		case *parser.CaseExpr:
			hasExcluded = true
			return false
		}
		return true
	})
	if hasReserved {
		return "", errors.New(errors.TypeUnsupported, errors.CodeUnsupported, "reserved keyword found in select clause")
	}
	if hasExcluded {
		return "", nil
	}
	// Exactly one unique column means we found the origin field; anything
	// else (zero or several columns) yields "".
	if columns := extractColumns(expr); len(columns) == 1 {
		return columns[0], nil
	}
	return "", nil
}
// containsJSONExtractFunction reports whether expr (or any sub-expression)
// contains a JSON extraction function call.
func containsJSONExtractFunction(expr parser.Expr) bool {
	if expr == nil {
		return false
	}
	found := false
	parser.Walk(expr, func(node parser.Expr) bool {
		if funcExpr, ok := node.(*parser.FunctionExpr); ok {
			// Guard against a nil function name before dereferencing, for
			// consistency with the nil check used for any() handling in the
			// ClickHouse extractor.
			if funcExpr.Name != nil && isFunctionPresentInStore(funcExpr.Name.Name, jsonExtractFunctions) {
				found = true
				return false // stop walking; answer is known
			}
		}
		return true
	})
	return found
}
// extractColumns returns all unique column names referenced by expr.
// Note: string literals (inside JSON extraction calls) are also treated as
// origin fields and appear in the result.
func extractColumns(expr parser.Expr) []string {
	seen := make(map[string]bool)
	extractColumnsHelper(expr, seen)
	names := make([]string, 0, len(seen))
	for name := range seen {
		names = append(names, name)
	}
	return names
}
// extractColumnsHelper recursively collects column references from expr into
// columnMap. String literals are also treated as origin fields and recorded.
func extractColumnsHelper(expr parser.Expr, columnMap map[string]bool) {
	switch e := expr.(type) {
	case *parser.Ident:
		// A bare identifier such as "region" or "timestamp" is a column.
		columnMap[e.Name] = true

	case *parser.StringLiteral:
		// String literals like 'us-east-1' or 'cpu.usage' are origin fields.
		columnMap[e.Literal] = true

	case *parser.FunctionExpr:
		// JSON extraction functions are special-cased: the first argument is
		// the column holding the JSON, the second is the path/key actually
		// being extracted, which we treat as the origin field. Nested JSON
		// extraction returns blank values (handled at the top level).
		if isFunctionPresentInStore(e.Name.Name, jsonExtractFunctions) {
			if e.Params == nil || e.Params.Items == nil || len(e.Params.Items.Items) < 2 {
				return
			}
			arg := e.Params.Items.Items[1]
			if lit, ok := arg.(*parser.StringLiteral); ok {
				// Literal path like 'service.name' is the origin field.
				columnMap[lit.Literal] = true
			} else {
				// Non-literal path: fall back to extracting columns from it.
				extractColumnsHelper(arg, columnMap)
			}
			return
		}
		// Ordinary function call, e.g. lower(name): recurse into arguments.
		if e.Params != nil && e.Params.Items != nil {
			for _, arg := range e.Params.Items.Items {
				extractColumnsHelper(arg, columnMap)
			}
		}

	case *parser.BinaryOperation:
		// e.g. "region = 'us-east-1'" or "unix_milli / 1000".
		extractColumnsHelper(e.LeftExpr, columnMap)
		extractColumnsHelper(e.RightExpr, columnMap)

	case *parser.ColumnExpr:
		// Column expression wrapper, e.g. "m.region" or "service.name".
		extractColumnsHelper(e.Expr, columnMap)

	case *parser.CastExpr:
		// e.g. CAST(unix_milli AS String).
		extractColumnsHelper(e.Expr, columnMap)

	case *parser.ParamExprList:
		if e.Items != nil {
			extractColumnsHelper(e.Items, columnMap)
		}

	case *parser.ColumnExprList:
		// e.g. coalesce(cpu_usage, 0) + coalesce(mem_usage, 0).
		for _, item := range e.Items {
			extractColumnsHelper(item, columnMap)
		}

	case *parser.Path:
		// Qualified name like table.column_name: only the final segment is
		// the column.
		if len(e.Fields) > 0 {
			extractColumnsHelper(e.Fields[len(e.Fields)-1], columnMap)
		}

	default:
		// Unknown expression types contribute no columns.
	}
}

View File

@@ -0,0 +1,237 @@
package queryfilterextractor
import (
"testing"
)
// TestExtractOriginField exercises extractCHOriginFieldFromQuery across JSON
// extraction functions, nested/qualified columns, literals, excluded
// functions, and malformed queries.
//
// Fixes over the previous version: failure messages name the function
// actually under test, and an unexpected error aborts the subtest with
// t.Fatalf instead of falling through to a second, misleading comparison
// against tt.expected.
func TestExtractOriginField(t *testing.T) {
	tests := []struct {
		name        string
		query       string
		expected    string
		expectError bool
	}{
		// JSON extraction functions - should return the second argument (JSON path/key) as origin field
		{
			name:     "JSONExtractString simple",
			query:    `SELECT JSONExtractString(labels, 'service.name')`,
			expected: "service.name",
		},
		{
			name:     "JSONExtractInt",
			query:    `SELECT JSONExtractInt(labels, 'status.code')`,
			expected: "status.code",
		},
		{
			name:     "JSONExtractFloat",
			query:    `SELECT JSONExtractFloat(labels, 'cpu.usage')`,
			expected: "cpu.usage",
		},
		{
			name:     "JSONExtractBool",
			query:    `SELECT JSONExtractBool(labels, 'feature.enabled')`,
			expected: "feature.enabled",
		},
		{
			name:     "JSONExtractString with function wrapper",
			query:    `SELECT lower(JSONExtractString(labels, 'user.email'))`,
			expected: "user.email",
		},
		{
			name:     "Nested JSON extraction",
			query:    `SELECT JSONExtractInt(JSONExtractRaw(labels, 'meta'), 'status.code')`,
			expected: "", // Nested JSON extraction should return blank
		},
		// Nested functions - should return the deepest column
		{
			name:     "Nested time functions with column",
			query:    `SELECT toStartOfInterval(toDateTime(intDiv(unix_milli, 1000)), toIntervalSecond(60))`,
			expected: "", // Contains toStartOfInterval and toDateTime which are excluded
		},
		{
			name:     "Division with column",
			query:    `SELECT unix_milli / 1000`,
			expected: "unix_milli",
		},
		{
			name:     "Function with single column",
			query:    `SELECT lower(unix_milli)`,
			expected: "unix_milli",
		},
		{
			name:     "CAST with single column",
			query:    `SELECT CAST(unix_milli AS String)`,
			expected: "unix_milli",
		},
		{
			name:     "intDiv with single column",
			query:    `SELECT intDiv(unix_milli, 1000)`,
			expected: "unix_milli",
		},
		// Multiple columns - should return blank
		{
			name:     "Multiple columns in coalesce",
			query:    `SELECT (coalesce(cpu_usage, 0) + coalesce(mem_usage, 0)) / 2`,
			expected: "",
		},
		{
			name:     "Multiple columns in arithmetic",
			query:    `SELECT cpu_usage + mem_usage`,
			expected: "",
		},
		{
			name:     "Multiple columns in function",
			query:    `SELECT concat(first_name, last_name)`,
			expected: "",
		},
		// IF/CASE conditions - should return blank
		{
			name:     "IF with single column in condition",
			query:    `SELECT IF(error_count > 0, service, 'healthy')`,
			expected: "", // Multiple columns: error_count and service
		},
		{
			name:     "IF with JSON and multiple columns",
			query:    `SELECT if(JSONExtractInt(metadata, 'retry.count') > 3, toLower(JSONExtractString(metadata, 'user.id')), hostname)`,
			expected: "", // Multiple columns: metadata and hostname
		},
		{
			name:     "String literal should return string",
			query:    `SELECT 'constant'`,
			expected: "constant",
		},
		// No columns - should return blank
		{
			name:     "Number literal",
			query:    `SELECT 42`,
			expected: "",
		},
		{
			name:     "Multiple literals",
			query:    `SELECT 'constant', 42`,
			expected: "",
		},
		{
			name:     "Multiple string literals",
			query:    `SELECT 'constant', '42'`,
			expected: "",
		},
		// Excluded functions - should return blank
		{
			name:     "now() function",
			query:    `SELECT now()`,
			expected: "",
		},
		{
			name:     "today() function",
			query:    `SELECT today()`,
			expected: "",
		},
		{
			name:     "count aggregate",
			query:    `SELECT count(user_id)`,
			expected: "",
		},
		{
			name:     "sum aggregate",
			query:    `SELECT sum(amount)`,
			expected: "",
		},
		// Single column simple cases
		{
			name:     "Simple column reference",
			query:    `SELECT user_id`,
			expected: "user_id",
		},
		{
			name:     "Column with alias",
			query:    `SELECT user_id AS id`,
			expected: "user_id",
		},
		{
			name:     "Column in arithmetic with literals (multiplication)",
			query:    `SELECT unix_milli * 1000`,
			expected: "unix_milli",
		},
		// Edge cases
		{
			name:     "Nested functions with single column deep",
			query:    `SELECT upper(lower(trim(column_name)))`,
			expected: "column_name",
		},
		// Qualified column names (Path)
		{
			name:     "Column with table prefix",
			query:    `SELECT table.column_name`,
			expected: "column_name", // IndexOperation: extracts column name from Index field
		},
		{
			name:     "Qualified column in function",
			query:    `SELECT lower(table.column_name)`,
			expected: "column_name",
		},
		{
			name:     "Qualified column in arithmetic",
			query:    `SELECT table.column_name * 100`,
			expected: "column_name",
		},
		{
			name:     "Nested qualified column (schema.table.column)",
			query:    `SELECT schema.table.column_name`,
			expected: "column_name", // Should extract the final column name
		},
		{
			name:     "Multiple qualified columns",
			query:    `SELECT table1.column1 + table2.column2`,
			expected: "", // Multiple columns: column1 and column2
		},
		{
			name:     "Qualified column with CAST",
			query:    `SELECT CAST(table.column_name AS String)`,
			expected: "column_name",
		},
		{
			name:     "Multiple select items - return blank",
			query:    `SELECT JSONExtractString(labels, 'service.name'), unix_milli / 1000, cpu_usage + mem_usage`,
			expected: "",
		},
		// Error cases
		{
			name:        "Invalid SQL syntax",
			query:       `SELECT FROM table`,
			expectError: true,
		},
		{
			name:        "Malformed query",
			query:       `SELECT * FROM`,
			expectError: true,
		},
	}
	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			result, err := extractCHOriginFieldFromQuery(tt.query)
			if tt.expectError {
				if err == nil {
					t.Fatalf("extractCHOriginFieldFromQuery() expected error but got nil, result = %q", result)
				}
				return
			}
			// Stop here on an unexpected error: comparing result afterwards
			// would only produce a second, misleading failure.
			if err != nil {
				t.Fatalf("extractCHOriginFieldFromQuery() unexpected error: %v", err)
			}
			if result != tt.expected {
				t.Errorf("extractCHOriginFieldFromQuery() = %q, want %q", result, tt.expected)
			}
		})
	}
}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,115 @@
package queryfilterextractor
import (
"github.com/SigNoz/signoz/pkg/errors"
"github.com/prometheus/prometheus/model/labels"
"github.com/prometheus/prometheus/promql/parser"
)
// PromQLFilterExtractor extracts metric names and grouping keys from PromQL queries.
// It is stateless; all per-query state lives in the promQLVisitor created by Extract.
type PromQLFilterExtractor struct{}
// NewPromQLFilterExtractor returns a ready-to-use PromQL filter extractor.
func NewPromQLFilterExtractor() *PromQLFilterExtractor {
	return new(PromQLFilterExtractor)
}
// Extract parses a PromQL query and returns the metric names it selects plus
// the grouping keys of its outermost aggregation. A parse failure yields an
// invalid-input error; a traversal failure yields an internal error.
func (e *PromQLFilterExtractor) Extract(query string) (*FilterResult, error) {
	ast, err := parser.ParseExpr(query)
	if err != nil {
		return nil, errors.NewInvalidInputf(errors.CodeInvalidInput, "failed to parse promql query: %s", err.Error())
	}

	v := &promQLVisitor{
		metricNames: make(map[string]bool),
		groupBy:     make(map[string]bool),
	}
	out := &FilterResult{
		MetricNames:    []string{},
		GroupByColumns: []ColumnInfo{},
	}

	// Traverse the AST, collecting metric names and grouping keys into the
	// visitor's de-duplicating sets.
	if err := parser.Walk(v, ast, nil); err != nil {
		return out, errors.NewInternalf(errors.CodeInternal, "failed to walk promql query: %s", err.Error())
	}

	// Flatten the sets into the result slices.
	for name := range v.metricNames {
		out.MetricNames = append(out.MetricNames, name)
	}
	for key := range v.groupBy {
		out.GroupByColumns = append(out.GroupByColumns, ColumnInfo{Name: key, OriginExpr: key, OriginField: key})
	}
	return out, nil
}
// promQLVisitor implements the parser.Visitor interface and accumulates the
// metric names and grouping keys found while walking a PromQL AST.
type promQLVisitor struct {
	// metricNames is the set of metric names referenced by vector selectors.
	metricNames map[string]bool
	// groupBy is the set of grouping labels from the outermost aggregation.
	groupBy map[string]bool
	// Track if we've already captured grouping from an outermost aggregation,
	// so only the first outermost aggregation with grouping contributes.
	hasOutermostGrouping bool
}
// Visit dispatches AST nodes to the matching handler. It always returns the
// visitor itself (continuing the traversal) and never reports an error.
func (v *promQLVisitor) Visit(node parser.Node, path []parser.Node) (parser.Visitor, error) {
	if vs, ok := node.(*parser.VectorSelector); ok {
		v.visitVectorSelector(vs)
	} else if ae, ok := node.(*parser.AggregateExpr); ok {
		v.visitAggregateExpr(ae, path)
	}
	return v, nil
}
// visitVectorSelector records the metric name referenced by a vector
// selector, whether given directly or via a __name__ label matcher.
func (v *promQLVisitor) visitVectorSelector(vs *parser.VectorSelector) {
	// Direct metric name, e.g. http_requests_total{...}.
	if vs.Name != "" {
		v.metricNames[vs.Name] = true
	}
	// Metric name supplied through a __name__ matcher. Only exact-equality
	// matchers contribute; negative and regex matchers (!=, =~, !~) are
	// intentionally skipped since they do not pin down a single metric.
	for _, m := range vs.LabelMatchers {
		if m.Name != labels.MetricName {
			continue
		}
		if m.Type == labels.MatchEqual {
			v.metricNames[m.Value] = true
		}
	}
}
// visitAggregateExpr captures grouping keys from the outermost aggregation.
// An aggregation counts as outermost when no other AggregateExpr appears on
// its ancestor path; subquery/function wrappers do not count as nesting.
func (v *promQLVisitor) visitAggregateExpr(ae *parser.AggregateExpr, path []parser.Node) {
	// Detect nesting by looking for any AggregateExpr ancestor
	// (path excludes the current node).
	nested := false
	for _, ancestor := range path {
		if _, ok := ancestor.(*parser.AggregateExpr); ok {
			nested = true
			break
		}
	}
	if nested || v.hasOutermostGrouping {
		// Either not outermost, or grouping was already captured.
		return
	}
	// `without` carries no grouping keys per spec; only `by` grouping counts.
	if ae.Without || len(ae.Grouping) == 0 {
		return
	}
	v.hasOutermostGrouping = true
	for _, label := range ae.Grouping {
		v.groupBy[label] = true
	}
	// Traversal continues elsewhere to find metrics in the expression.
}

View File

@@ -0,0 +1,205 @@
package queryfilterextractor
import (
"reflect"
"testing"
)
// TestPromQLFilterExtractor_Extract checks metric-name and group-by
// extraction across selectors, aggregations, subqueries, binary operations,
// and invalid queries. Note that subquery/function wrappers do not count as
// aggregation nesting, so grouping inside them is still treated as outermost.
func TestPromQLFilterExtractor_Extract(t *testing.T) {
	extractor := NewPromQLFilterExtractor()
	tests := []struct {
		name               string
		query              string
		wantMetrics        []string
		wantGroupByColumns []ColumnInfo
		wantError          bool
	}{
		{
			name:               "P1 - Simple vector selector",
			query:              `http_requests_total{job="api"}`,
			wantMetrics:        []string{"http_requests_total"},
			wantGroupByColumns: []ColumnInfo{},
		},
		{
			name:               "P2 - Function call",
			query:              `rate(cpu_usage_seconds_total[5m])`,
			wantMetrics:        []string{"cpu_usage_seconds_total"},
			wantGroupByColumns: []ColumnInfo{},
		},
		{
			name:               "P3 - Aggregation with by()",
			query:              `sum by (pod,region) (rate(http_requests_total[5m]))`,
			wantMetrics:        []string{"http_requests_total"},
			wantGroupByColumns: []ColumnInfo{{Name: "pod", OriginExpr: "pod", OriginField: "pod"}, {Name: "region", OriginExpr: "region", OriginField: "region"}},
		},
		{
			name:               "P4 - Aggregation with without()",
			query:              `sum without (instance) (rate(cpu_usage_total[1m]))`,
			wantMetrics:        []string{"cpu_usage_total"},
			wantGroupByColumns: []ColumnInfo{}, // without() means no grouping keys per spec
		},
		{
			name:               "P5 - Invalid: metric name set twice",
			query:              `sum(rate(http_requests_total{__name__!="http_requests_error_total"}[5m]))`,
			wantMetrics:        []string{},
			wantGroupByColumns: []ColumnInfo{},
			wantError:          true,
		},
		{
			name:               "P6 - Regex negative label",
			query:              `sum(rate(http_requests_total{status!~"5.."}[5m]))`,
			wantMetrics:        []string{"http_requests_total"},
			wantGroupByColumns: []ColumnInfo{},
		},
		{
			name:               "P7 - Nested aggregations",
			query:              `sum by (region) (max by (pod, region) (cpu_usage_total{env="prod"}))`,
			wantMetrics:        []string{"cpu_usage_total"},
			wantGroupByColumns: []ColumnInfo{{Name: "region", OriginExpr: "region", OriginField: "region"}}, // Only outermost grouping
		},
		{
			name:               "P7a - Nested aggregation: inner grouping ignored",
			query:              `sum(max by (pod) (cpu_usage_total{env="prod"}))`,
			wantMetrics:        []string{"cpu_usage_total"},
			wantGroupByColumns: []ColumnInfo{}, // Inner grouping is ignored when outer has no grouping (nestingLevel != 0 case)
		},
		{
			name:               "P8 - Arithmetic expression",
			query:              `(http_requests_total{job="api"} + http_errors_total{job="api"})`,
			wantMetrics:        []string{"http_requests_total", "http_errors_total"},
			wantGroupByColumns: []ColumnInfo{},
		},
		{
			name:               "P9 - Mix of positive metric & exclusion label",
			query:              `sum by (region)(rate(foo{job!="db"}[5m]))`,
			wantMetrics:        []string{"foo"},
			wantGroupByColumns: []ColumnInfo{{Name: "region", OriginExpr: "region", OriginField: "region"}},
		},
		{
			name:               "P10 - Function + aggregation",
			query:              `histogram_quantile(0.9, sum(rate(http_request_duration_seconds_bucket[5m])) by (le))`,
			wantMetrics:        []string{"http_request_duration_seconds_bucket"},
			wantGroupByColumns: []ColumnInfo{{Name: "le", OriginExpr: "le", OriginField: "le"}},
		},
		{
			name:               "P11 - Subquery",
			query:              `sum_over_time(cpu_usage_total[1h:5m])`,
			wantMetrics:        []string{"cpu_usage_total"},
			wantGroupByColumns: []ColumnInfo{},
		},
		{
			name:               "P12 - Nested aggregation inside subquery",
			query:              `max_over_time(sum(rate(cpu_usage_total[5m]))[1h:5m])`,
			wantMetrics:        []string{"cpu_usage_total"},
			wantGroupByColumns: []ColumnInfo{},
		},
		{
			name:               "P13 - Subquery with multiple metrics",
			query:              `avg_over_time((foo + bar)[10m:1m])`,
			wantMetrics:        []string{"foo", "bar"},
			wantGroupByColumns: []ColumnInfo{},
		},
		{
			name:               "P14 - Simple meta-metric",
			query:              `sum by (pod) (up)`,
			wantMetrics:        []string{"up"},
			wantGroupByColumns: []ColumnInfo{{Name: "pod", OriginExpr: "pod", OriginField: "pod"}},
		},
		{
			name:               "P15 - Binary operator unless",
			query:              `sum(rate(http_requests_total[5m])) unless avg(rate(http_errors_total[5m]))`,
			wantMetrics:        []string{"http_requests_total", "http_errors_total"},
			wantGroupByColumns: []ColumnInfo{},
		},
		{
			name:               "P16 - Vector matching",
			query:              `sum(rate(foo[5m])) / ignoring(instance) group_left(job) sum(rate(bar[5m]))`,
			wantMetrics:        []string{"foo", "bar"},
			wantGroupByColumns: []ColumnInfo{},
		},
		{
			name:               "P17 - Offset modifier with aggregation",
			query:              `sum by (env)(rate(cpu_usage_seconds_total{job="api"}[5m] offset 1h))`,
			wantMetrics:        []string{"cpu_usage_seconds_total"},
			wantGroupByColumns: []ColumnInfo{{Name: "env", OriginExpr: "env", OriginField: "env"}},
		},
		{
			name:               "P18 - Invalid syntax",
			query:              `sum by ((foo)(bar))(http_requests_total)`,
			wantMetrics:        []string{},
			wantGroupByColumns: []ColumnInfo{},
			wantError:          true,
		},
		{
			name:               "P19 - Literal expression",
			query:              `2 + 3`,
			wantMetrics:        []string{},
			wantGroupByColumns: []ColumnInfo{},
		},
		{
			name:               "P20 - Aggregation inside subquery with deriv",
			query:              `deriv(sum by (instance)(rate(node_network_receive_bytes_total[5m]))[30m:5m])`,
			wantMetrics:        []string{"node_network_receive_bytes_total"},
			wantGroupByColumns: []ColumnInfo{{Name: "instance", OriginExpr: "instance", OriginField: "instance"}}, // Subquery wrapping does not count as aggregation nesting, so this grouping is captured as outermost
		},
		{
			name:               "P21 - Aggregation inside subquery with avg_over_time",
			query:              `avg_over_time(sum by (job)(rate(http_requests_total[1m]))[30m:1m])`,
			wantMetrics:        []string{"http_requests_total"},
			wantGroupByColumns: []ColumnInfo{{Name: "job", OriginExpr: "job", OriginField: "job"}}, // Subquery wrapping does not count as aggregation nesting, so this grouping is captured as outermost
		},
		{
			name:               "P22 - Aggregation inside subquery with max_over_time",
			query:              `max_over_time(sum by (pod)(rate(container_restarts_total[5m]))[1h:5m])`,
			wantMetrics:        []string{"container_restarts_total"},
			wantGroupByColumns: []ColumnInfo{{Name: "pod", OriginExpr: "pod", OriginField: "pod"}}, // Subquery wrapping does not count as aggregation nesting, so this grouping is captured as outermost
		},
		{
			name:               "P23 - Aggregation inside subquery with deriv (no rate)",
			query:              `deriv(sum by (namespace)(container_memory_working_set_bytes)[1h:10m])`,
			wantMetrics:        []string{"container_memory_working_set_bytes"},
			wantGroupByColumns: []ColumnInfo{{Name: "namespace", OriginExpr: "namespace", OriginField: "namespace"}}, // Subquery wrapping does not count as aggregation nesting, so this grouping is captured as outermost
		},
		{
			name:               "P24 - Aggregation inside subquery with histogram_quantile",
			query:              `histogram_quantile(0.95, avg_over_time(sum by (le, service)(rate(http_request_duration_seconds_bucket[5m]))[1h:5m]))`,
			wantMetrics:        []string{"http_request_duration_seconds_bucket"},
			wantGroupByColumns: []ColumnInfo{{Name: "le", OriginExpr: "le", OriginField: "le"}, {Name: "service", OriginExpr: "service", OriginField: "service"}}, // Subquery wrapping does not count as aggregation nesting, so this grouping is captured as outermost
		},
	}
	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			result, err := extractor.Extract(tt.query)
			// Check error expectation
			if tt.wantError {
				if err == nil {
					t.Errorf("Extract() expected error but got none, query: %s", tt.query)
				}
				return
			}
			if err != nil {
				t.Errorf("Extract() unexpected error = %v, query: %s", err, tt.query)
				return
			}
			// Sort for comparison
			gotMetrics := sortStrings(result.MetricNames)
			wantMetrics := sortStrings(tt.wantMetrics)
			if !reflect.DeepEqual(gotMetrics, wantMetrics) {
				t.Errorf("Extract() MetricNames = %v, want %v", gotMetrics, wantMetrics)
			}
			// Test GroupByColumns - need to normalize for comparison (order may vary)
			gotGroupByColumns := sortColumnInfo(result.GroupByColumns)
			wantGroupByColumns := sortColumnInfo(tt.wantGroupByColumns)
			if !reflect.DeepEqual(gotGroupByColumns, wantGroupByColumns) {
				t.Errorf("Extract() GroupByColumns = %v, want %v", gotGroupByColumns, wantGroupByColumns)
			}
		})
	}
}

View File

@@ -0,0 +1,58 @@
// Package queryfilterextractor provides utilities for extracting metric names
// and grouping keys.
//
// This is useful for metrics discovery, and query analysis.
package queryfilterextractor
import "github.com/SigNoz/signoz/pkg/errors"
const (
	// ExtractorCH selects the ClickHouse SQL implementation of FilterExtractor.
	ExtractorCH = "qfe_ch"
	// ExtractorPromQL selects the PromQL implementation of FilterExtractor.
	ExtractorPromQL = "qfe_promql"
)
// ColumnInfo represents a column in the query
type ColumnInfo struct {
	// Name is the column's name as it appears in the query.
	Name string
	// Alias is the AS-alias, if any; empty when the column is not aliased.
	Alias string
	// OriginExpr is the expression the column value is derived from.
	OriginExpr string
	// OriginField is the underlying field the expression resolves to
	// (e.g. the JSON key for JSONExtract* expressions).
	OriginField string
}
// GroupName returns the field name in the resulting data which is used for
// grouping: the alias when one is set, otherwise the column name itself.
//
// Examples:
//
//	SELECT region as new_region FROM metrics WHERE metric_name='cpu' GROUP BY region
//	  -> "new_region"
//
//	SELECT region FROM metrics WHERE metric_name='cpu' GROUP BY region
//	  -> "region"
func (c *ColumnInfo) GroupName() string {
	if c.Alias == "" {
		return c.Name
	}
	return c.Alias
}
// FilterResult holds what a FilterExtractor pulled out of a query.
type FilterResult struct {
	// MetricNames are the metrics that are being filtered on
	MetricNames []string
	// GroupByColumns are the columns that are being grouped by
	GroupByColumns []ColumnInfo
}
// FilterExtractor extracts metric names and group-by columns from a query
// string; implementations exist for ClickHouse SQL and PromQL.
type FilterExtractor interface {
	// Extract parses query and returns the extracted filters, or an error
	// when the query cannot be parsed.
	Extract(query string) (*FilterResult, error)
}
// NewExtractor returns the FilterExtractor implementation for the given
// extractor type (ExtractorCH or ExtractorPromQL). Any other value yields an
// invalid-input error.
func NewExtractor(extractorType string) (FilterExtractor, error) {
	switch extractorType {
	case ExtractorCH:
		return NewClickHouseFilterExtractor(), nil
	case ExtractorPromQL:
		return NewPromQLFilterExtractor(), nil
	default:
		// Use the NewInvalidInputf helper for consistency with the rest of
		// the package (see PromQLFilterExtractor.Extract) instead of
		// spelling out type and code with errors.Newf.
		return nil, errors.NewInvalidInputf(errors.CodeInvalidInput, "invalid extractor type: %s", extractorType)
	}
}