chore: metric name and group by extractor with CH and PromQL support (#9543)

This commit is contained in:
Abhishek Kumar Singh
2025-11-20 22:58:16 +05:30
committed by GitHub
parent 096e38ee91
commit 09cbe4aa0d
9 changed files with 2889 additions and 3 deletions

2
go.mod
View File

@@ -4,7 +4,7 @@ go 1.24.0
require (
dario.cat/mergo v1.0.1
github.com/AfterShip/clickhouse-sql-parser v0.4.11
github.com/AfterShip/clickhouse-sql-parser v0.4.16
github.com/ClickHouse/clickhouse-go/v2 v2.40.1
github.com/DATA-DOG/go-sqlmock v1.5.2
github.com/SigNoz/govaluate v0.0.0-20240203125216-988004ccc7fd

4
go.sum
View File

@@ -66,8 +66,8 @@ dario.cat/mergo v1.0.1/go.mod h1:uNxQE+84aUszobStD9th8a29P2fMDhsBdgRYvZOxGmk=
dmitri.shuralyov.com/gpu/mtl v0.0.0-20190408044501-666a987793e9/go.mod h1:H6x//7gZCb22OMCxBHrMx7a5I7Hp++hsVxbQ4BYO7hU=
filippo.io/edwards25519 v1.1.0 h1:FNf4tywRC1HmFuKW5xopWpigGjJKiJSV0Cqo0cJWDaA=
filippo.io/edwards25519 v1.1.0/go.mod h1:BxyFTGdWcka3PhytdK4V28tE5sGfRvvvRV7EaN4VDT4=
github.com/AfterShip/clickhouse-sql-parser v0.4.11 h1:fZMKAjRmgzW44+hEhF6ywi4VjFZQjJ8QrFBbgBsjmF4=
github.com/AfterShip/clickhouse-sql-parser v0.4.11/go.mod h1:W0Z82wJWkJxz2RVun/RMwxue3g7ut47Xxl+SFqdJGus=
github.com/AfterShip/clickhouse-sql-parser v0.4.16 h1:gpl+wXclYUKT0p4+gBq22XeRYWwEoZ9f35vogqMvkLQ=
github.com/AfterShip/clickhouse-sql-parser v0.4.16/go.mod h1:W0Z82wJWkJxz2RVun/RMwxue3g7ut47Xxl+SFqdJGus=
github.com/Azure/azure-sdk-for-go v68.0.0+incompatible h1:fcYLmCpyNYRnvJbPerq7U0hS+6+I79yEDJBqVNcqUzU=
github.com/Azure/azure-sdk-for-go/sdk/azcore v1.18.0 h1:Gt0j3wceWMwPmiazCa8MzMA0MfhmPIz0Qp0FJ6qcM0U=
github.com/Azure/azure-sdk-for-go/sdk/azcore v1.18.0/go.mod h1:Ot/6aikWnKWi4l9QB7qVSwa8iMphQNqkWALMoNT3rzM=

View File

@@ -0,0 +1,687 @@
package queryfilterextractor
import (
"fmt"
"strings"
clickhouse "github.com/AfterShip/clickhouse-sql-parser/parser"
"github.com/SigNoz/signoz/pkg/errors"
)
const (
	// MetricNameColumn is the ClickHouse column holding the metric name;
	// comparisons against this column are what the extractor treats as
	// metric-name filters.
	MetricNameColumn = "metric_name"
)
// ClickHouseFilterExtractor extracts metric names and grouping keys from
// ClickHouse SQL queries. It holds no state, so the zero value is usable.
type ClickHouseFilterExtractor struct{}
// NewClickHouseFilterExtractor returns a ready-to-use ClickHouse filter extractor.
func NewClickHouseFilterExtractor() *ClickHouseFilterExtractor {
	extractor := &ClickHouseFilterExtractor{}
	return extractor
}
// Extract parses a ClickHouse query and extracts metric names and grouping keys.
// It returns an invalid-input error when the query cannot be parsed.
func (e *ClickHouseFilterExtractor) Extract(query string) (*FilterResult, error) {
	stmts, err := clickhouse.NewParser(query).ParseStmts()
	if err != nil {
		return nil, errors.NewInvalidInputf(errors.CodeInvalidInput, "failed to parse clickhouse query: %s", err.Error())
	}

	// Pass 1: walk each top-level SELECT to collect metric names, remembering
	// those queries for the GROUP BY pass that follows.
	metricSet := make(map[string]bool)
	topLevel := make(map[*clickhouse.SelectQuery]bool)
	for _, stmt := range stmts {
		sq, isSelect := stmt.(*clickhouse.SelectQuery)
		if !isSelect {
			continue
		}
		topLevel[sq] = true
		clickhouse.Walk(sq, func(node clickhouse.Expr) bool {
			e.fillMetricNamesFromExpr(node, metricSet)
			return true // keep traversing
		})
	}

	// Pass 2: build a CTE name -> query map so GROUP BY extraction can follow
	// references through CTEs and subqueries.
	ctes := make(map[string]*clickhouse.SelectQuery)
	for sq := range topLevel {
		e.buildCTEMap(sq, ctes)
	}

	// Resolve GROUP BY columns (with aliases and origins) recursively; when
	// the same column name appears across queries, the last ColumnInfo wins.
	colByName := make(map[string]ColumnInfo)
	seen := make(map[*clickhouse.SelectQuery]bool)
	for sq := range topLevel {
		cols, err := e.extractGroupByColumns(sq, ctes, seen)
		if err != nil {
			return nil, err
		}
		for _, c := range cols {
			colByName[c.Name] = c
		}
	}

	// Materialize both sets into the result slices.
	out := &FilterResult{MetricNames: []string{}, GroupByColumns: []ColumnInfo{}}
	for name := range metricSet {
		out.MetricNames = append(out.MetricNames, name)
	}
	for _, c := range colByName {
		out.GroupByColumns = append(out.GroupByColumns, c)
	}
	return out, nil
}
// ========================================
// Metric Name Extraction
// ========================================

// fillMetricNamesFromExpr inspects one AST node and records any metric names
// it contributes; only binary operations can carry a metric_name filter.
func (e *ClickHouseFilterExtractor) fillMetricNamesFromExpr(node clickhouse.Expr, metricNames map[string]bool) {
	if binOp, ok := node.(*clickhouse.BinaryOperation); ok {
		e.fillMetricFromBinaryOp(binOp, metricNames)
	}
}
// fillMetricFromBinaryOp extracts metrics from a binary operation, handling
// the metric_name column on either side of the comparison.
func (e *ClickHouseFilterExtractor) fillMetricFromBinaryOp(op *clickhouse.BinaryOperation, metricNames map[string]bool) {
	// metric_name = 'value'
	if e.getColumnName(op.LeftExpr) == MetricNameColumn {
		e.fillMetricWithBinaryOpConditions(op, op.RightExpr, metricNames)
		return
	}
	// 'value' = metric_name
	if e.getColumnName(op.RightExpr) == MetricNameColumn {
		e.fillMetricWithBinaryOpConditions(op, op.LeftExpr, metricNames)
	}
}
// fillMetricWithBinaryOpConditions records metric names from the value side of
// a binary operation whose other side is the metric_name column.
//
// Supported operators:
//   - "=", "==": literal string values, or values wrapped in any()
//   - "IN", "GLOBAL IN": every literal string value in the list
//
// Unsupported operators (can be added later if needed):
//   - "!=", "<>", "NOT IN": negative filters (e.g., metric_name != 'a')
//   - "LIKE", "ILIKE", "NOT LIKE", "NOT ILIKE": pattern-matching filters
//   - "OR", "AND": not handled here because Walk traverses both sides of
//     boolean operations and extracts metrics from each branch on its own
//     (e.g., metric_name='a' OR metric_name='b')
func (e *ClickHouseFilterExtractor) fillMetricWithBinaryOpConditions(op *clickhouse.BinaryOperation, valueExpr clickhouse.Expr, metricNames map[string]bool) {
	fn, isFunc := valueExpr.(*clickhouse.FunctionExpr)
	switch op.Operation {
	case clickhouse.TokenKindSingleEQ, clickhouse.TokenKindDoubleEQ:
		// metric_name = 'value' or metric_name = any(['a', 'b'])
		if isFunc {
			// Function-wrapped literals are ignored (test case: CH59),
			// except any(), whose members we still extract.
			if fn.Name != nil && fn.Name.Name == "any" {
				e.extractInValues(valueExpr, metricNames)
			}
			return
		}
		if lit := e.extractStringLiteral(valueExpr); lit != "" {
			metricNames[lit] = true
		}
	case "IN", "GLOBAL IN":
		// metric_name IN ('a', 'b', 'c'); GLOBAL IN behaves the same for
		// metric extraction. Function-wrapped literals are ignored (CH59).
		if !isFunc {
			e.extractInValues(valueExpr, metricNames)
		}
	}
}
// extractStringLiteral returns the value of a string-literal expression, or ""
// for any other expression type.
func (e *ClickHouseFilterExtractor) extractStringLiteral(expr clickhouse.Expr) string {
	if lit, ok := expr.(*clickhouse.StringLiteral); ok {
		return lit.Literal
	}
	return ""
}
// extractInValues collects every string literal found anywhere inside expr
// (e.g. the members of an IN list) into metricNames.
func (e *ClickHouseFilterExtractor) extractInValues(expr clickhouse.Expr, metricNames map[string]bool) {
	// Metric names passed in an IN condition appear as string literals.
	literals := clickhouse.FindAll(expr, func(n clickhouse.Expr) bool {
		_, isLit := n.(*clickhouse.StringLiteral)
		return isLit
	})
	for _, node := range literals {
		lit, ok := node.(*clickhouse.StringLiteral)
		if !ok {
			continue
		}
		if val := e.extractStringLiteral(lit); val != "" {
			metricNames[val] = true
		}
	}
}
// ========================================
// GROUP BY Column Extraction
// ========================================

// extractGroupByColumns extracts the GROUP BY columns from a query.
// It follows a top-down approach: an outer GROUP BY overrides any GROUP BY
// found deeper inside subqueries and CTEs.
// Returns a slice of ColumnInfo carrying each column's name, its SELECT alias
// (if any), and its traced origin expression/field.
//
// visited guards against revisiting the same SelectQuery (cyclic CTE
// references); an already-visited query yields (nil, nil).
func (e *ClickHouseFilterExtractor) extractGroupByColumns(query *clickhouse.SelectQuery, cteMap map[string]*clickhouse.SelectQuery, visited map[*clickhouse.SelectQuery]bool) ([]ColumnInfo, error) {
	if visited[query] {
		return nil, nil
	}
	// Mark this query as visited to prevent cycles
	visited[query] = true
	// First, check if this query has its own GROUP BY using direct field access
	hasGroupBy := query.GroupBy != nil
	// If this query has GROUP BY, use it (outer overrides inner)
	if hasGroupBy {
		// Collect the GROUP BY column names into a set
		tempGroupBy := make(map[string]bool)
		e.fillGroupsFromGroupByClause(query.GroupBy, tempGroupBy)
		// Extract SELECT columns and their aliases from the same query level
		selectAliases := e.extractSelectColumns(query)
		// Build ColumnInfo array by matching GROUP BY with SELECT aliases and origins
		result := []ColumnInfo{}
		for groupByCol := range tempGroupBy {
			alias := selectAliases[groupByCol] // Will be "" if not in SELECT
			// Trace the column back through subqueries/CTEs to its origin
			// expression. A fresh visited map is used per column so one
			// column's trace does not cut another column's trace short.
			originVisited := make(map[*clickhouse.SelectQuery]bool)
			originExpr := e.extractColumnOrigin(groupByCol, query, cteMap, originVisited)
			// Re-parse the origin expression as a tiny SELECT to derive the
			// underlying origin field (plain column or JSON-extracted key).
			originField, err := extractCHOriginFieldFromQuery(fmt.Sprintf("SELECT %s", originExpr))
			if err != nil {
				return nil, err
			}
			result = append(result, ColumnInfo{
				Name:        groupByCol,
				Alias:       alias,
				OriginExpr:  originExpr,
				OriginField: originField,
			})
		}
		return result, nil
	}
	// No GROUP BY at this level: the grouping may live inside the CTE or
	// subquery this query selects from, so follow the source recursively.
	sourceQuery := e.extractSourceQuery(query, cteMap)
	if sourceQuery != nil {
		return e.extractGroupByColumns(sourceQuery, cteMap, visited)
	}
	return nil, nil
}
// fillGroupsFromGroupByClause records the column names listed in a GROUP BY
// clause into the groupBy set.
func (e *ClickHouseFilterExtractor) fillGroupsFromGroupByClause(groupByClause *clickhouse.GroupByClause, groupBy map[string]bool) {
	// Find (not FindAll) is used so only the first, direct-child
	// ColumnExprList is taken, never a nested one.
	node, found := clickhouse.Find(groupByClause, func(n clickhouse.Expr) bool {
		_, isList := n.(*clickhouse.ColumnExprList)
		return isList
	})
	if !found {
		return
	}
	exprList, ok := node.(*clickhouse.ColumnExprList)
	if !ok || exprList.Items == nil {
		return
	}
	// Only top-level Items are read: for 'toDate(timestamp)' we record the
	// whole call and never the nested 'timestamp'.
	for _, item := range exprList.Items {
		key := e.extractColumnStrByExpr(item)
		if key == "" {
			continue
		}
		// Drop any table-alias prefix (e.g. "m.region" -> "region").
		groupBy[e.stripTableAlias(key)] = true
	}
}
// extractColumnStrByExpr renders an expression as the string used for GROUP BY
// matching.
//
// Supports:
//   - Ident: simple identifier like "region" or "timestamp" (backticks are
//     restored, since the parser strips them)
//   - FunctionExpr: the complete call text like "toDate(timestamp)"
//   - ColumnExpr: unwraps to the underlying expression
//   - anything else: the expression's String() form
//
// Examples:
//   - "region" -> "region"
//   - "toDate(timestamp)" -> "toDate(timestamp)"
//   - "`m.region`" -> "`m.region`"
func (e *ClickHouseFilterExtractor) extractColumnStrByExpr(expr clickhouse.Expr) string {
	if expr == nil {
		return ""
	}
	switch ex := expr.(type) {
	case *clickhouse.Ident:
		// Backticks are ClickHouse's quoting for literal names; the parser
		// removes them, so add them back to preserve the original form.
		if ex.QuoteType == clickhouse.BackTicks {
			return "`" + ex.Name + "`"
		}
		return ex.Name
	case *clickhouse.FunctionExpr:
		// Return the complete function-call string.
		return ex.String()
	case *clickhouse.ColumnExpr:
		// ColumnExpr wraps another expression; recurse when one is present.
		if ex.Expr != nil {
			return e.extractColumnStrByExpr(ex.Expr)
		}
		return ex.String()
	}
	// Fallback: the expression's own string representation.
	return expr.String()
}
// stripTableAlias removes a table-alias prefix from a column name
// (e.g. "m.region" -> "region"). Backtick-quoted names are ClickHouse literal
// identifiers, so their inner dots are preserved (`os.type` -> "os.type").
func (e *ClickHouseFilterExtractor) stripTableAlias(name string) string {
	// Backticked literal name: strip the quotes but keep the whole string.
	if strings.HasPrefix(name, "`") && strings.HasSuffix(name, "`") {
		return strings.Trim(name, "`")
	}
	// Otherwise keep only the part after the last dot, if any.
	if idx := strings.LastIndex(name, "."); idx >= 0 {
		return name[idx+1:]
	}
	return name
}
// getColumnName extracts a bare column name from an expression, handling both
// simple identifiers and qualified paths like "m.metric_name".
func (e *ClickHouseFilterExtractor) getColumnName(expr clickhouse.Expr) string {
	switch ex := expr.(type) {
	case *clickhouse.Ident:
		return ex.Name
	case *clickhouse.Path:
		// For a qualified name, the final field is the column itself.
		if n := len(ex.Fields); n > 0 {
			return ex.Fields[n-1].Name
		}
	}
	return ""
}
// extractSourceQuery resolves the query feeding the FROM clause of the given
// query: either a CTE referenced by name, or an inline subquery.
//
// For example, in:
//
//	WITH aggregated AS (
//	    SELECT region as region_alias, sum(value) AS total
//	    FROM metrics
//	    WHERE metric_name = 'cpu_usage'
//	    GROUP BY region
//	)
//	SELECT * FROM aggregated
//
// the FROM clause names "aggregated", so that CTE's SelectQuery is returned
// from cteMap and becomes the source for GROUP BY extraction.
func (e *ClickHouseFilterExtractor) extractSourceQuery(query *clickhouse.SelectQuery, cteMap map[string]*clickhouse.SelectQuery) *clickhouse.SelectQuery {
	if query.From == nil {
		return nil
	}
	// Collect every identifier and nested SELECT inside the FROM clause.
	candidates := clickhouse.FindAll(query.From, func(node clickhouse.Expr) bool {
		switch node.(type) {
		case *clickhouse.Ident, *clickhouse.SelectQuery:
			return true
		default:
			return false
		}
	})
	for _, candidate := range candidates {
		switch src := candidate.(type) {
		case *clickhouse.Ident:
			// A bare name may refer to a CTE.
			if cte, ok := cteMap[src.Name]; ok {
				return cte
			}
		case *clickhouse.SelectQuery:
			// Inline subquery in FROM.
			return src
		}
	}
	return nil
}
// ========================================
// Column Origin Tracing
// ========================================

// extractColumnOrigin recursively traces a column back to its original
// expression through CTEs, subqueries, and SELECT aliases.
// Returns the original expression string
// (e.g., "JSONExtractString(labels, 'service.name')"), or the column name
// itself if it is a direct column reference or cannot be traced further.
func (e *ClickHouseFilterExtractor) extractColumnOrigin(
	columnName string,
	query *clickhouse.SelectQuery,
	cteMap map[string]*clickhouse.SelectQuery,
	visited map[*clickhouse.SelectQuery]bool,
) string {
	if query == nil {
		return columnName
	}
	// Prevent infinite recursion and redundant work:
	// once a query is visited, we don't need to check it again.
	if visited[query] {
		return columnName
	}
	visited[query] = true
	// The visited mark must only prevent recursion within the CURRENT search;
	// it must not suppress searches started for other columns/queries, so it
	// is removed once the search for this query completes.
	defer delete(visited, query)
	// Step 1: descend through CTEs and joined sources first, so the deepest
	// (most original) definition of the column wins when one exists.
	sourceQuery := e.extractSourceQuery(query, cteMap)
	if sourceQuery != nil {
		returningOrigin := e.extractColumnOrigin(columnName, sourceQuery, cteMap, visited)
		if returningOrigin != columnName {
			return returningOrigin
		}
	}
	// Step 2: no deeper source produced an origin, so inspect every
	// SelectItem of this query level.
	selectItems := clickhouse.FindAll(query, func(node clickhouse.Expr) bool {
		_, ok := node.(*clickhouse.SelectItem)
		return ok
	})
	// extractOriginFromSelectItem extracts the origin from one SelectItem.
	// It returns nil when the item does not match columnName at all.
	extractOriginFromSelectItem := func(selectItem *clickhouse.SelectItem) *string {
		// Check if this SelectItem matches our column (by alias or by name)
		alias := e.extractSelectItemAlias(selectItem)
		exprStr := e.extractSelectItemName(selectItem)
		normalizedExpr := e.stripTableAlias(exprStr)
		// Case 1: Column matches an alias in SELECT
		if alias == columnName {
			// This is an alias - get the expression it's aliasing
			if selectItem.Expr != nil {
				originExpr := e.extractFullExpression(selectItem.Expr)
				// If the expression is just a column name, trace it back further
				if normalizedExpr == columnName || e.isSimpleColumnReference(selectItem.Expr) {
					// It's referencing another column - trace back through source query
					sourceQuery := e.extractSourceQuery(query, cteMap)
					if sourceQuery != nil {
						originExpr := e.extractColumnOrigin(normalizedExpr, sourceQuery, cteMap, visited)
						return &originExpr
					}
				}
				return &originExpr
			}
		}
		// Case 2: Column matches the expression itself (no alias)
		if normalizedExpr == columnName {
			// Check if this is a simple column reference or a complex expression
			if e.isSimpleColumnReference(selectItem.Expr) {
				// Simple column - trace back through source query
				sourceQuery := e.extractSourceQuery(query, cteMap)
				if sourceQuery != nil {
					originExpr := e.extractColumnOrigin(columnName, sourceQuery, cteMap, visited)
					return &originExpr
				}
				return &columnName
			} else {
				// Complex expression - return it as origin
				originExpr := e.extractFullExpression(selectItem.Expr)
				return &originExpr
			}
		}
		return nil
	}
	var finalColumnOrigin string
	for _, itemNode := range selectItems {
		if selectItem, ok := itemNode.(*clickhouse.SelectItem); ok {
			// Keep the LAST non-nil origin across all SelectItems; this
			// reaches the most nested origin of the column where a matching
			// SelectItem is present.
			origin := extractOriginFromSelectItem(selectItem)
			if origin != nil {
				finalColumnOrigin = *origin
			}
		}
	}
	if finalColumnOrigin != "" {
		return finalColumnOrigin
	}
	return columnName
}
// extractFullExpression returns the complete string form of an expression,
// or "" for nil.
func (e *ClickHouseFilterExtractor) extractFullExpression(expr clickhouse.Expr) string {
	if expr != nil {
		return expr.String()
	}
	return ""
}
// isSimpleColumnReference reports whether expr is a plain column reference
// rather than a function call or other complex expression.
func (e *ClickHouseFilterExtractor) isSimpleColumnReference(expr clickhouse.Expr) bool {
	switch ex := expr.(type) {
	case nil:
		return false
	case *clickhouse.Ident:
		// Backtick-quoted identifiers are deliberately treated as non-simple
		// so the origin expression keeps its backticks; the origin parser
		// later handles the backticks and extracts the column name.
		return ex.QuoteType != clickhouse.BackTicks
	case *clickhouse.Path:
		return true
	case *clickhouse.ColumnExpr:
		// Unwrap and test the inner expression when present.
		if ex.Expr != nil {
			return e.isSimpleColumnReference(ex.Expr)
		}
	}
	return false
}
// ========================================
// SELECT Column Alias Extraction
// ========================================

// extractSelectColumns maps normalized SELECT column names to their aliases
// for a specific query. For duplicate columns with different aliases, the
// last alias wins. Like fillGroupsFromGroupByClause, it works over the
// query's SelectItem nodes only.
func (e *ClickHouseFilterExtractor) extractSelectColumns(query *clickhouse.SelectQuery) map[string]string {
	aliases := map[string]string{}
	if query == nil {
		return aliases
	}
	// SelectItem nodes carry both the column expression (Expr) and an
	// optional alias (Alias).
	items := clickhouse.FindAll(query, func(node clickhouse.Expr) bool {
		_, isItem := node.(*clickhouse.SelectItem)
		return isItem
	})
	for _, node := range items {
		item, ok := node.(*clickhouse.SelectItem)
		if !ok {
			continue
		}
		name := e.extractSelectItemName(item)
		if name == "" {
			continue
		}
		// Normalize by stripping any table alias; last alias wins on dupes.
		aliases[e.stripTableAlias(name)] = e.extractSelectItemAlias(item)
	}
	return aliases
}
// extractSelectItemName returns the column name or expression text of a
// SelectItem, or "" when the item or its expression is absent.
func (e *ClickHouseFilterExtractor) extractSelectItemName(selectItem *clickhouse.SelectItem) string {
	if selectItem != nil && selectItem.Expr != nil {
		return e.extractColumnStrByExpr(selectItem.Expr)
	}
	return ""
}
// extractSelectItemAlias returns the alias of a SelectItem, or "" when no
// alias is present.
func (e *ClickHouseFilterExtractor) extractSelectItemAlias(selectItem *clickhouse.SelectItem) string {
	if selectItem == nil || selectItem.Alias == nil {
		return ""
	}
	// Alias is an *Ident; its Name is already "" when no alias was written.
	return selectItem.Alias.Name
}
// ========================================
// CTE and Subquery Extraction
// ========================================

// buildCTEMap records every CTE name -> SelectQuery pair reachable from query,
// recursing into nested CTEs and into subqueries/other expressions.
func (e *ClickHouseFilterExtractor) buildCTEMap(query *clickhouse.SelectQuery, cteMap map[string]*clickhouse.SelectQuery) {
	// CTEs declared directly on this query's WITH clause.
	if query.With != nil && query.With.CTEs != nil {
		for _, cte := range query.With.CTEs {
			name := e.extractCTEName(cte)
			body := e.extractCTEQuery(cte)
			if name == "" || body == nil {
				continue
			}
			cteMap[name] = body
			// A CTE body may itself declare further CTEs.
			e.buildCTEMap(body, cteMap)
		}
	}
	// CTEs can also appear inside subqueries and other expressions.
	e.buildCTEMapFromExpr(query, cteMap)
}
// extractCTEName returns the name of a CTE; the CTEStmt's Expr field holds it.
func (e *ClickHouseFilterExtractor) extractCTEName(cte *clickhouse.CTEStmt) string {
	if cte == nil || cte.Expr == nil {
		return ""
	}
	if ident, ok := cte.Expr.(*clickhouse.Ident); ok {
		return ident.Name
	}
	// Non-identifier names fall back to the expression's string form.
	return cte.Expr.String()
}
// extractCTEQuery returns the CTE's body; the CTEStmt's Alias field holds the
// SelectQuery, or nil when it is absent or of an unexpected type.
func (e *ClickHouseFilterExtractor) extractCTEQuery(cte *clickhouse.CTEStmt) *clickhouse.SelectQuery {
	if cte == nil || cte.Alias == nil {
		return nil
	}
	sq, ok := cte.Alias.(*clickhouse.SelectQuery)
	if !ok {
		return nil
	}
	return sq
}
// buildCTEMapFromExpr walks expr and collects CTEs declared inside any nested
// SelectQuery, table expression, or join.
func (e *ClickHouseFilterExtractor) buildCTEMapFromExpr(expr clickhouse.Expr, cteMap map[string]*clickhouse.SelectQuery) {
	clickhouse.Walk(expr, func(node clickhouse.Expr) bool {
		switch n := node.(type) {
		case *clickhouse.TableExpr:
			if n.Expr != nil {
				e.buildCTEMapFromExpr(n.Expr, cteMap)
			}
		case *clickhouse.JoinTableExpr:
			if n.Table != nil {
				e.buildCTEMapFromExpr(n.Table, cteMap)
			}
		case *clickhouse.SelectQuery:
			// Skip the node we started from to avoid infinite recursion.
			if n != expr {
				e.buildCTEMap(n, cteMap)
			}
		}
		return true // continue traversal
	})
}

View File

@@ -0,0 +1,305 @@
package queryfilterextractor
import (
"strings"
"github.com/AfterShip/clickhouse-sql-parser/parser"
"github.com/SigNoz/signoz/pkg/errors"
)
// excludedFunctions contains functions whose presence makes an expression
// yield no origin field: extractOriginFieldFromExpr returns "" when any of
// them appears, since time bucketing and aggregation hide the source column.
// Map key is the function name in lowercase, value is the original function name.
var excludedFunctions = map[string]string{
	// Time functions
	"now":                     "now",
	"today":                   "today",
	"yesterday":               "yesterday",
	"todatetime":              "toDateTime",
	"todatetime64":            "toDateTime64",
	"todate":                  "toDate",
	"todate32":                "toDate32",
	"tostartofinterval":       "toStartOfInterval",
	"tostartofday":            "toStartOfDay",
	"tostartofweek":           "toStartOfWeek",
	"tostartofmonth":          "toStartOfMonth",
	"tostartofquarter":        "toStartOfQuarter",
	"tostartofyear":           "toStartOfYear",
	"tostartofhour":           "toStartOfHour",
	"tostartofminute":         "toStartOfMinute",
	"tostartofsecond":         "toStartOfSecond",
	"tostartoffiveminutes":    "toStartOfFiveMinutes",
	"tostartoftenminutes":     "toStartOfTenMinutes",
	"tostartoffifteenminutes": "toStartOfFifteenMinutes",
	"tointervalsecond":        "toIntervalSecond",
	"tointervalminute":        "toIntervalMinute",
	"tointervalhour":          "toIntervalHour",
	"tointervalday":           "toIntervalDay",
	"tointervalweek":          "toIntervalWeek",
	"tointervalmonth":         "toIntervalMonth",
	"tointervalquarter":       "toIntervalQuarter",
	"tointervalyear":          "toIntervalYear",
	"parsedatetime":           "parseDateTime",
	"parsedatetimebesteffort": "parseDateTimeBestEffort",
	// Aggregate functions
	"count":          "count",
	"sum":            "sum",
	"avg":            "avg",
	"min":            "min",
	"max":            "max",
	"any":            "any",
	"stddevpop":      "stddevPop",
	"stddevsamp":     "stddevSamp",
	"varpop":         "varPop",
	"varsamp":        "varSamp",
	"grouparray":     "groupArray",
	"groupuniqarray": "groupUniqArray",
	"quantile":       "quantile",
	"quantiles":      "quantiles",
	"quantileexact":  "quantileExact",
	"quantiletiming": "quantileTiming",
	"median":         "median",
	"uniq":           "uniq",
	"uniqexact":      "uniqExact",
	"uniqcombined":   "uniqCombined",
	"uniqhll12":      "uniqHLL12",
	"topk":           "topK",
	"first":          "first",
	"last":           "last",
}
// jsonExtractFunctions contains the ClickHouse functions that extract values
// from JSON columns; for these, the extracted key (second argument) is used
// as the origin field instead of the column itself.
// Map key is the function name in lowercase, value is the original function name.
var jsonExtractFunctions = map[string]string{
	"jsonextractstring":        "JSONExtractString",
	"jsonextractint":           "JSONExtractInt",
	"jsonextractuint":          "JSONExtractUInt",
	"jsonextractfloat":         "JSONExtractFloat",
	"jsonextractbool":          "JSONExtractBool",
	"jsonextract":              "JSONExtract",
	"jsonextractraw":           "JSONExtractRaw",
	"jsonextractarrayraw":      "JSONExtractArrayRaw",
	"jsonextractkeysandvalues": "JSONExtractKeysAndValues",
}
// isFunctionPresentInStore reports whether funcName exists (compared
// case-insensitively) as a key in the given function store map.
func isFunctionPresentInStore(funcName string, funcStore map[string]string) bool {
	_, ok := funcStore[strings.ToLower(funcName)]
	return ok
}
// isReservedSelectKeyword reports whether keyword is a reserved keyword for
// the SELECT statement. Only keywords that can appear unquoted inside a
// SELECT statement are included.
func isReservedSelectKeyword(keyword string) bool {
	switch strings.ToUpper(keyword) {
	case parser.KeywordSelect, parser.KeywordFrom:
		return true
	}
	return false
}
// extractCHOriginFieldFromQuery extracts the origin field (column name) from a
// "SELECT <expr>" query string, or the extracted key in case of JSON
// extraction functions. Returns "" with no error when no single origin field
// can be derived.
func extractCHOriginFieldFromQuery(query string) (string, error) {
	// Parse the query string
	p := parser.NewParser(query)
	stmts, err := p.ParseStmts()
	if err != nil {
		return "", errors.NewInternalf(errors.CodeInternal, "failed to parse origin field from query: %s", err.Error())
	}
	// Guard against an empty statement list before indexing; previously
	// stmts[0] would panic here.
	if len(stmts) == 0 {
		return "", errors.NewInternalf(errors.CodeInternal, "origin field query produced no statements")
	}
	// The first statement must be a SELECT; use a checked assertion instead
	// of panicking on an unexpected statement type.
	selectStmt, ok := stmts[0].(*parser.SelectQuery)
	if !ok {
		return "", errors.NewInternalf(errors.CodeInternal, "origin field query is not a SELECT statement")
	}
	// If query has multiple select items, return blank string as we don't expect multiple select items
	if len(selectStmt.SelectItems) > 1 {
		return "", nil
	}
	if len(selectStmt.SelectItems) == 0 {
		return "", errors.NewInternalf(errors.CodeInternal, "SELECT query has no select items")
	}
	// Extract origin field from the first (and only) select item's expression
	return extractOriginFieldFromExpr(selectStmt.SelectItems[0].Expr)
}
// extractOriginFieldFromExpr derives the origin field (column name) from a
// parsed select-item expression. It returns:
//   - an error when the expression contains a reserved keyword
//   - "" (no error) when the expression uses excluded functions, CASE,
//     nested JSON extraction, or does not reduce to exactly one column
//   - the single column name otherwise
func extractOriginFieldFromExpr(expr parser.Expr) (string, error) {
	var hasExcluded, hasReserved bool
	parser.Walk(expr, func(node parser.Expr) bool {
		switch n := node.(type) {
		case *parser.Ident:
			// Reserved keywords parse as valid identifiers — e.g. in
			// "SELECT FROM table" the parser treats FROM as a column — so
			// they must be rejected explicitly.
			if n.QuoteType == parser.Unquoted && isReservedSelectKeyword(n.Name) {
				hasReserved = true
				return false
			}
		case *parser.FunctionExpr:
			// Excluded functions (time bucketing, aggregates, ...) mean no
			// single origin column exists.
			if isFunctionPresentInStore(n.Name.Name, excludedFunctions) {
				hasExcluded = true
				return false
			}
			// JSON extraction is allowed unless one of its arguments nests
			// another JSON extraction call.
			if isFunctionPresentInStore(n.Name.Name, jsonExtractFunctions) &&
				n.Params != nil && n.Params.Items != nil {
				for _, arg := range n.Params.Items.Items {
					if containsJSONExtractFunction(arg) {
						hasExcluded = true
						return false
					}
				}
			}
		case *parser.CaseExpr:
			hasExcluded = true
			return false
		}
		return true
	})
	if hasReserved {
		return "", errors.New(errors.TypeUnsupported, errors.CodeUnsupported, "reserved keyword found in select clause")
	}
	if hasExcluded {
		return "", nil
	}
	// Exactly one unique column means we found the origin field; anything
	// else (zero or several columns) yields "".
	if columns := extractColumns(expr); len(columns) == 1 {
		return columns[0], nil
	}
	return "", nil
}
// containsJSONExtractFunction reports whether expr (or any sub-expression)
// contains a JSON extraction function call.
func containsJSONExtractFunction(expr parser.Expr) bool {
	if expr == nil {
		return false
	}
	found := false
	parser.Walk(expr, func(node parser.Expr) bool {
		if funcExpr, ok := node.(*parser.FunctionExpr); ok {
			// Guard against a nil function name before dereferencing, for
			// consistency with the nil check used for any() handling in the
			// ClickHouse extractor.
			if funcExpr.Name != nil && isFunctionPresentInStore(funcExpr.Name.Name, jsonExtractFunctions) {
				found = true
				return false // stop walking; answer is known
			}
		}
		return true
	})
	return found
}
// extractColumns returns all unique column names referenced by expr.
// Note: string literals (inside JSON extraction calls) are also treated as
// origin fields and appear in the result.
func extractColumns(expr parser.Expr) []string {
	seen := make(map[string]bool)
	extractColumnsHelper(expr, seen)
	names := make([]string, 0, len(seen))
	for name := range seen {
		names = append(names, name)
	}
	return names
}
// extractColumnsHelper recursively collects column references from expr into
// columnMap. String literals are also treated as origin fields and recorded.
func extractColumnsHelper(expr parser.Expr, columnMap map[string]bool) {
	switch e := expr.(type) {
	case *parser.Ident:
		// A bare identifier such as "region" or "timestamp" is a column.
		columnMap[e.Name] = true

	case *parser.StringLiteral:
		// String literals like 'us-east-1' or 'cpu.usage' are origin fields.
		columnMap[e.Literal] = true

	case *parser.FunctionExpr:
		// JSON extraction functions are special-cased: the first argument is
		// the column holding the JSON, the second is the path/key actually
		// being extracted, which we treat as the origin field. Nested JSON
		// extraction returns blank values (handled at the top level).
		if isFunctionPresentInStore(e.Name.Name, jsonExtractFunctions) {
			if e.Params == nil || e.Params.Items == nil || len(e.Params.Items.Items) < 2 {
				return
			}
			arg := e.Params.Items.Items[1]
			if lit, ok := arg.(*parser.StringLiteral); ok {
				// Literal path like 'service.name' is the origin field.
				columnMap[lit.Literal] = true
			} else {
				// Non-literal path: fall back to extracting columns from it.
				extractColumnsHelper(arg, columnMap)
			}
			return
		}
		// Ordinary function call, e.g. lower(name): recurse into arguments.
		if e.Params != nil && e.Params.Items != nil {
			for _, arg := range e.Params.Items.Items {
				extractColumnsHelper(arg, columnMap)
			}
		}

	case *parser.BinaryOperation:
		// e.g. "region = 'us-east-1'" or "unix_milli / 1000".
		extractColumnsHelper(e.LeftExpr, columnMap)
		extractColumnsHelper(e.RightExpr, columnMap)

	case *parser.ColumnExpr:
		// Column expression wrapper, e.g. "m.region" or "service.name".
		extractColumnsHelper(e.Expr, columnMap)

	case *parser.CastExpr:
		// e.g. CAST(unix_milli AS String).
		extractColumnsHelper(e.Expr, columnMap)

	case *parser.ParamExprList:
		if e.Items != nil {
			extractColumnsHelper(e.Items, columnMap)
		}

	case *parser.ColumnExprList:
		// e.g. coalesce(cpu_usage, 0) + coalesce(mem_usage, 0).
		for _, item := range e.Items {
			extractColumnsHelper(item, columnMap)
		}

	case *parser.Path:
		// Qualified name like table.column_name: only the final segment is
		// the column.
		if len(e.Fields) > 0 {
			extractColumnsHelper(e.Fields[len(e.Fields)-1], columnMap)
		}

	default:
		// Unknown expression types contribute no columns.
	}
}

View File

@@ -0,0 +1,237 @@
package queryfilterextractor
import (
"testing"
)
// TestExtractOriginField exercises extractCHOriginFieldFromQuery across JSON
// extraction functions, nested/qualified columns, literals, excluded
// functions, and malformed queries.
//
// Fixes over the previous version: failure messages name the function
// actually under test, and an unexpected error aborts the subtest with
// t.Fatalf instead of falling through to a second, misleading comparison
// against tt.expected.
func TestExtractOriginField(t *testing.T) {
	tests := []struct {
		name        string
		query       string
		expected    string
		expectError bool
	}{
		// JSON extraction functions - should return the second argument (JSON path/key) as origin field
		{
			name:     "JSONExtractString simple",
			query:    `SELECT JSONExtractString(labels, 'service.name')`,
			expected: "service.name",
		},
		{
			name:     "JSONExtractInt",
			query:    `SELECT JSONExtractInt(labels, 'status.code')`,
			expected: "status.code",
		},
		{
			name:     "JSONExtractFloat",
			query:    `SELECT JSONExtractFloat(labels, 'cpu.usage')`,
			expected: "cpu.usage",
		},
		{
			name:     "JSONExtractBool",
			query:    `SELECT JSONExtractBool(labels, 'feature.enabled')`,
			expected: "feature.enabled",
		},
		{
			name:     "JSONExtractString with function wrapper",
			query:    `SELECT lower(JSONExtractString(labels, 'user.email'))`,
			expected: "user.email",
		},
		{
			name:     "Nested JSON extraction",
			query:    `SELECT JSONExtractInt(JSONExtractRaw(labels, 'meta'), 'status.code')`,
			expected: "", // Nested JSON extraction should return blank
		},
		// Nested functions - should return the deepest column
		{
			name:     "Nested time functions with column",
			query:    `SELECT toStartOfInterval(toDateTime(intDiv(unix_milli, 1000)), toIntervalSecond(60))`,
			expected: "", // Contains toStartOfInterval and toDateTime which are excluded
		},
		{
			name:     "Division with column",
			query:    `SELECT unix_milli / 1000`,
			expected: "unix_milli",
		},
		{
			name:     "Function with single column",
			query:    `SELECT lower(unix_milli)`,
			expected: "unix_milli",
		},
		{
			name:     "CAST with single column",
			query:    `SELECT CAST(unix_milli AS String)`,
			expected: "unix_milli",
		},
		{
			name:     "intDiv with single column",
			query:    `SELECT intDiv(unix_milli, 1000)`,
			expected: "unix_milli",
		},
		// Multiple columns - should return blank
		{
			name:     "Multiple columns in coalesce",
			query:    `SELECT (coalesce(cpu_usage, 0) + coalesce(mem_usage, 0)) / 2`,
			expected: "",
		},
		{
			name:     "Multiple columns in arithmetic",
			query:    `SELECT cpu_usage + mem_usage`,
			expected: "",
		},
		{
			name:     "Multiple columns in function",
			query:    `SELECT concat(first_name, last_name)`,
			expected: "",
		},
		// IF/CASE conditions - should return blank
		{
			name:     "IF with single column in condition",
			query:    `SELECT IF(error_count > 0, service, 'healthy')`,
			expected: "", // Multiple columns: error_count and service
		},
		{
			name:     "IF with JSON and multiple columns",
			query:    `SELECT if(JSONExtractInt(metadata, 'retry.count') > 3, toLower(JSONExtractString(metadata, 'user.id')), hostname)`,
			expected: "", // Multiple columns: metadata and hostname
		},
		{
			name:     "String literal should return string",
			query:    `SELECT 'constant'`,
			expected: "constant",
		},
		// No columns - should return blank
		{
			name:     "Number literal",
			query:    `SELECT 42`,
			expected: "",
		},
		{
			name:     "Multiple literals",
			query:    `SELECT 'constant', 42`,
			expected: "",
		},
		{
			name:     "Multiple string literals",
			query:    `SELECT 'constant', '42'`,
			expected: "",
		},
		// Excluded functions - should return blank
		{
			name:     "now() function",
			query:    `SELECT now()`,
			expected: "",
		},
		{
			name:     "today() function",
			query:    `SELECT today()`,
			expected: "",
		},
		{
			name:     "count aggregate",
			query:    `SELECT count(user_id)`,
			expected: "",
		},
		{
			name:     "sum aggregate",
			query:    `SELECT sum(amount)`,
			expected: "",
		},
		// Single column simple cases
		{
			name:     "Simple column reference",
			query:    `SELECT user_id`,
			expected: "user_id",
		},
		{
			name:     "Column with alias",
			query:    `SELECT user_id AS id`,
			expected: "user_id",
		},
		{
			name:     "Column in arithmetic with literals (multiplication)",
			query:    `SELECT unix_milli * 1000`,
			expected: "unix_milli",
		},
		// Edge cases
		{
			name:     "Nested functions with single column deep",
			query:    `SELECT upper(lower(trim(column_name)))`,
			expected: "column_name",
		},
		// Qualified column names (Path)
		{
			name:     "Column with table prefix",
			query:    `SELECT table.column_name`,
			expected: "column_name", // IndexOperation: extracts column name from Index field
		},
		{
			name:     "Qualified column in function",
			query:    `SELECT lower(table.column_name)`,
			expected: "column_name",
		},
		{
			name:     "Qualified column in arithmetic",
			query:    `SELECT table.column_name * 100`,
			expected: "column_name",
		},
		{
			name:     "Nested qualified column (schema.table.column)",
			query:    `SELECT schema.table.column_name`,
			expected: "column_name", // Should extract the final column name
		},
		{
			name:     "Multiple qualified columns",
			query:    `SELECT table1.column1 + table2.column2`,
			expected: "", // Multiple columns: column1 and column2
		},
		{
			name:     "Qualified column with CAST",
			query:    `SELECT CAST(table.column_name AS String)`,
			expected: "column_name",
		},
		{
			name:     "Multiple select items - return blank",
			query:    `SELECT JSONExtractString(labels, 'service.name'), unix_milli / 1000, cpu_usage + mem_usage`,
			expected: "",
		},
		// Error cases
		{
			name:        "Invalid SQL syntax",
			query:       `SELECT FROM table`,
			expectError: true,
		},
		{
			name:        "Malformed query",
			query:       `SELECT * FROM`,
			expectError: true,
		},
	}
	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			result, err := extractCHOriginFieldFromQuery(tt.query)
			if tt.expectError {
				if err == nil {
					t.Fatalf("extractCHOriginFieldFromQuery() expected error but got nil, result = %q", result)
				}
				return
			}
			// Stop here on an unexpected error: comparing result afterwards
			// would only produce a second, misleading failure.
			if err != nil {
				t.Fatalf("extractCHOriginFieldFromQuery() unexpected error: %v", err)
			}
			if result != tt.expected {
				t.Errorf("extractCHOriginFieldFromQuery() = %q, want %q", result, tt.expected)
			}
		})
	}
}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,115 @@
package queryfilterextractor
import (
"github.com/SigNoz/signoz/pkg/errors"
"github.com/prometheus/prometheus/model/labels"
"github.com/prometheus/prometheus/promql/parser"
)
// PromQLFilterExtractor extracts metric names and grouping keys from PromQL queries.
// It is stateless; all per-query state lives in the promQLVisitor created by Extract.
type PromQLFilterExtractor struct{}
// NewPromQLFilterExtractor returns a ready-to-use PromQL filter extractor.
func NewPromQLFilterExtractor() *PromQLFilterExtractor {
	return new(PromQLFilterExtractor)
}
// Extract parses a PromQL query and returns the metric names it selects plus
// the grouping keys of its outermost aggregation. A parse failure yields an
// invalid-input error; a traversal failure yields an internal error.
func (e *PromQLFilterExtractor) Extract(query string) (*FilterResult, error) {
	ast, err := parser.ParseExpr(query)
	if err != nil {
		return nil, errors.NewInvalidInputf(errors.CodeInvalidInput, "failed to parse promql query: %s", err.Error())
	}

	v := &promQLVisitor{
		metricNames: make(map[string]bool),
		groupBy:     make(map[string]bool),
	}
	out := &FilterResult{
		MetricNames:    []string{},
		GroupByColumns: []ColumnInfo{},
	}

	// Traverse the AST, collecting metric names and grouping keys into the
	// visitor's de-duplicating sets.
	if err := parser.Walk(v, ast, nil); err != nil {
		return out, errors.NewInternalf(errors.CodeInternal, "failed to walk promql query: %s", err.Error())
	}

	// Flatten the sets into the result slices.
	for name := range v.metricNames {
		out.MetricNames = append(out.MetricNames, name)
	}
	for key := range v.groupBy {
		out.GroupByColumns = append(out.GroupByColumns, ColumnInfo{Name: key, OriginExpr: key, OriginField: key})
	}
	return out, nil
}
// promQLVisitor implements the parser.Visitor interface and accumulates the
// metric names and grouping keys found while walking a PromQL AST.
type promQLVisitor struct {
	// metricNames is the set of metric names referenced by vector selectors.
	metricNames map[string]bool
	// groupBy is the set of grouping labels from the outermost aggregation.
	groupBy map[string]bool
	// Track if we've already captured grouping from an outermost aggregation,
	// so only the first outermost aggregation with grouping contributes.
	hasOutermostGrouping bool
}
// Visit dispatches AST nodes to the matching handler. It always returns the
// visitor itself (continuing the traversal) and never reports an error.
func (v *promQLVisitor) Visit(node parser.Node, path []parser.Node) (parser.Visitor, error) {
	if vs, ok := node.(*parser.VectorSelector); ok {
		v.visitVectorSelector(vs)
	} else if ae, ok := node.(*parser.AggregateExpr); ok {
		v.visitAggregateExpr(ae, path)
	}
	return v, nil
}
// visitVectorSelector records the metric name referenced by a vector
// selector, whether given directly or via a __name__ label matcher.
func (v *promQLVisitor) visitVectorSelector(vs *parser.VectorSelector) {
	// Direct metric name, e.g. http_requests_total{...}.
	if vs.Name != "" {
		v.metricNames[vs.Name] = true
	}
	// Metric name supplied through a __name__ matcher. Only exact-equality
	// matchers contribute; negative and regex matchers (!=, =~, !~) are
	// intentionally skipped since they do not pin down a single metric.
	for _, m := range vs.LabelMatchers {
		if m.Name != labels.MetricName {
			continue
		}
		if m.Type == labels.MatchEqual {
			v.metricNames[m.Value] = true
		}
	}
}
// visitAggregateExpr captures grouping keys from the outermost aggregation.
// An aggregation counts as outermost when no other AggregateExpr appears on
// its ancestor path; subquery/function wrappers do not count as nesting.
func (v *promQLVisitor) visitAggregateExpr(ae *parser.AggregateExpr, path []parser.Node) {
	// Detect nesting by looking for any AggregateExpr ancestor
	// (path excludes the current node).
	nested := false
	for _, ancestor := range path {
		if _, ok := ancestor.(*parser.AggregateExpr); ok {
			nested = true
			break
		}
	}
	if nested || v.hasOutermostGrouping {
		// Either not outermost, or grouping was already captured.
		return
	}
	// `without` carries no grouping keys per spec; only `by` grouping counts.
	if ae.Without || len(ae.Grouping) == 0 {
		return
	}
	v.hasOutermostGrouping = true
	for _, label := range ae.Grouping {
		v.groupBy[label] = true
	}
	// Traversal continues elsewhere to find metrics in the expression.
}

View File

@@ -0,0 +1,205 @@
package queryfilterextractor
import (
"reflect"
"testing"
)
// TestPromQLFilterExtractor_Extract checks metric-name and group-by
// extraction across selectors, aggregations, subqueries, binary operations,
// and invalid queries. Note that subquery/function wrappers do not count as
// aggregation nesting, so grouping inside them is still treated as outermost.
func TestPromQLFilterExtractor_Extract(t *testing.T) {
	extractor := NewPromQLFilterExtractor()
	tests := []struct {
		name               string
		query              string
		wantMetrics        []string
		wantGroupByColumns []ColumnInfo
		wantError          bool
	}{
		{
			name:               "P1 - Simple vector selector",
			query:              `http_requests_total{job="api"}`,
			wantMetrics:        []string{"http_requests_total"},
			wantGroupByColumns: []ColumnInfo{},
		},
		{
			name:               "P2 - Function call",
			query:              `rate(cpu_usage_seconds_total[5m])`,
			wantMetrics:        []string{"cpu_usage_seconds_total"},
			wantGroupByColumns: []ColumnInfo{},
		},
		{
			name:               "P3 - Aggregation with by()",
			query:              `sum by (pod,region) (rate(http_requests_total[5m]))`,
			wantMetrics:        []string{"http_requests_total"},
			wantGroupByColumns: []ColumnInfo{{Name: "pod", OriginExpr: "pod", OriginField: "pod"}, {Name: "region", OriginExpr: "region", OriginField: "region"}},
		},
		{
			name:               "P4 - Aggregation with without()",
			query:              `sum without (instance) (rate(cpu_usage_total[1m]))`,
			wantMetrics:        []string{"cpu_usage_total"},
			wantGroupByColumns: []ColumnInfo{}, // without() means no grouping keys per spec
		},
		{
			name:               "P5 - Invalid: metric name set twice",
			query:              `sum(rate(http_requests_total{__name__!="http_requests_error_total"}[5m]))`,
			wantMetrics:        []string{},
			wantGroupByColumns: []ColumnInfo{},
			wantError:          true,
		},
		{
			name:               "P6 - Regex negative label",
			query:              `sum(rate(http_requests_total{status!~"5.."}[5m]))`,
			wantMetrics:        []string{"http_requests_total"},
			wantGroupByColumns: []ColumnInfo{},
		},
		{
			name:               "P7 - Nested aggregations",
			query:              `sum by (region) (max by (pod, region) (cpu_usage_total{env="prod"}))`,
			wantMetrics:        []string{"cpu_usage_total"},
			wantGroupByColumns: []ColumnInfo{{Name: "region", OriginExpr: "region", OriginField: "region"}}, // Only outermost grouping
		},
		{
			name:               "P7a - Nested aggregation: inner grouping ignored",
			query:              `sum(max by (pod) (cpu_usage_total{env="prod"}))`,
			wantMetrics:        []string{"cpu_usage_total"},
			wantGroupByColumns: []ColumnInfo{}, // Inner grouping is ignored when outer has no grouping (nestingLevel != 0 case)
		},
		{
			name:               "P8 - Arithmetic expression",
			query:              `(http_requests_total{job="api"} + http_errors_total{job="api"})`,
			wantMetrics:        []string{"http_requests_total", "http_errors_total"},
			wantGroupByColumns: []ColumnInfo{},
		},
		{
			name:               "P9 - Mix of positive metric & exclusion label",
			query:              `sum by (region)(rate(foo{job!="db"}[5m]))`,
			wantMetrics:        []string{"foo"},
			wantGroupByColumns: []ColumnInfo{{Name: "region", OriginExpr: "region", OriginField: "region"}},
		},
		{
			name:               "P10 - Function + aggregation",
			query:              `histogram_quantile(0.9, sum(rate(http_request_duration_seconds_bucket[5m])) by (le))`,
			wantMetrics:        []string{"http_request_duration_seconds_bucket"},
			wantGroupByColumns: []ColumnInfo{{Name: "le", OriginExpr: "le", OriginField: "le"}},
		},
		{
			name:               "P11 - Subquery",
			query:              `sum_over_time(cpu_usage_total[1h:5m])`,
			wantMetrics:        []string{"cpu_usage_total"},
			wantGroupByColumns: []ColumnInfo{},
		},
		{
			name:               "P12 - Nested aggregation inside subquery",
			query:              `max_over_time(sum(rate(cpu_usage_total[5m]))[1h:5m])`,
			wantMetrics:        []string{"cpu_usage_total"},
			wantGroupByColumns: []ColumnInfo{},
		},
		{
			name:               "P13 - Subquery with multiple metrics",
			query:              `avg_over_time((foo + bar)[10m:1m])`,
			wantMetrics:        []string{"foo", "bar"},
			wantGroupByColumns: []ColumnInfo{},
		},
		{
			name:               "P14 - Simple meta-metric",
			query:              `sum by (pod) (up)`,
			wantMetrics:        []string{"up"},
			wantGroupByColumns: []ColumnInfo{{Name: "pod", OriginExpr: "pod", OriginField: "pod"}},
		},
		{
			name:               "P15 - Binary operator unless",
			query:              `sum(rate(http_requests_total[5m])) unless avg(rate(http_errors_total[5m]))`,
			wantMetrics:        []string{"http_requests_total", "http_errors_total"},
			wantGroupByColumns: []ColumnInfo{},
		},
		{
			name:               "P16 - Vector matching",
			query:              `sum(rate(foo[5m])) / ignoring(instance) group_left(job) sum(rate(bar[5m]))`,
			wantMetrics:        []string{"foo", "bar"},
			wantGroupByColumns: []ColumnInfo{},
		},
		{
			name:               "P17 - Offset modifier with aggregation",
			query:              `sum by (env)(rate(cpu_usage_seconds_total{job="api"}[5m] offset 1h))`,
			wantMetrics:        []string{"cpu_usage_seconds_total"},
			wantGroupByColumns: []ColumnInfo{{Name: "env", OriginExpr: "env", OriginField: "env"}},
		},
		{
			name:               "P18 - Invalid syntax",
			query:              `sum by ((foo)(bar))(http_requests_total)`,
			wantMetrics:        []string{},
			wantGroupByColumns: []ColumnInfo{},
			wantError:          true,
		},
		{
			name:               "P19 - Literal expression",
			query:              `2 + 3`,
			wantMetrics:        []string{},
			wantGroupByColumns: []ColumnInfo{},
		},
		{
			name:               "P20 - Aggregation inside subquery with deriv",
			query:              `deriv(sum by (instance)(rate(node_network_receive_bytes_total[5m]))[30m:5m])`,
			wantMetrics:        []string{"node_network_receive_bytes_total"},
			wantGroupByColumns: []ColumnInfo{{Name: "instance", OriginExpr: "instance", OriginField: "instance"}}, // Subquery wrapping does not count as aggregation nesting, so this grouping is captured as outermost
		},
		{
			name:               "P21 - Aggregation inside subquery with avg_over_time",
			query:              `avg_over_time(sum by (job)(rate(http_requests_total[1m]))[30m:1m])`,
			wantMetrics:        []string{"http_requests_total"},
			wantGroupByColumns: []ColumnInfo{{Name: "job", OriginExpr: "job", OriginField: "job"}}, // Subquery wrapping does not count as aggregation nesting, so this grouping is captured as outermost
		},
		{
			name:               "P22 - Aggregation inside subquery with max_over_time",
			query:              `max_over_time(sum by (pod)(rate(container_restarts_total[5m]))[1h:5m])`,
			wantMetrics:        []string{"container_restarts_total"},
			wantGroupByColumns: []ColumnInfo{{Name: "pod", OriginExpr: "pod", OriginField: "pod"}}, // Subquery wrapping does not count as aggregation nesting, so this grouping is captured as outermost
		},
		{
			name:               "P23 - Aggregation inside subquery with deriv (no rate)",
			query:              `deriv(sum by (namespace)(container_memory_working_set_bytes)[1h:10m])`,
			wantMetrics:        []string{"container_memory_working_set_bytes"},
			wantGroupByColumns: []ColumnInfo{{Name: "namespace", OriginExpr: "namespace", OriginField: "namespace"}}, // Subquery wrapping does not count as aggregation nesting, so this grouping is captured as outermost
		},
		{
			name:               "P24 - Aggregation inside subquery with histogram_quantile",
			query:              `histogram_quantile(0.95, avg_over_time(sum by (le, service)(rate(http_request_duration_seconds_bucket[5m]))[1h:5m]))`,
			wantMetrics:        []string{"http_request_duration_seconds_bucket"},
			wantGroupByColumns: []ColumnInfo{{Name: "le", OriginExpr: "le", OriginField: "le"}, {Name: "service", OriginExpr: "service", OriginField: "service"}}, // Subquery wrapping does not count as aggregation nesting, so this grouping is captured as outermost
		},
	}
	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			result, err := extractor.Extract(tt.query)
			// Check error expectation
			if tt.wantError {
				if err == nil {
					t.Errorf("Extract() expected error but got none, query: %s", tt.query)
				}
				return
			}
			if err != nil {
				t.Errorf("Extract() unexpected error = %v, query: %s", err, tt.query)
				return
			}
			// Sort for comparison
			gotMetrics := sortStrings(result.MetricNames)
			wantMetrics := sortStrings(tt.wantMetrics)
			if !reflect.DeepEqual(gotMetrics, wantMetrics) {
				t.Errorf("Extract() MetricNames = %v, want %v", gotMetrics, wantMetrics)
			}
			// Test GroupByColumns - need to normalize for comparison (order may vary)
			gotGroupByColumns := sortColumnInfo(result.GroupByColumns)
			wantGroupByColumns := sortColumnInfo(tt.wantGroupByColumns)
			if !reflect.DeepEqual(gotGroupByColumns, wantGroupByColumns) {
				t.Errorf("Extract() GroupByColumns = %v, want %v", gotGroupByColumns, wantGroupByColumns)
			}
		})
	}
}

View File

@@ -0,0 +1,58 @@
// Package queryfilterextractor provides utilities for extracting metric names
// and grouping keys.
//
// This is useful for metrics discovery, and query analysis.
package queryfilterextractor
import "github.com/SigNoz/signoz/pkg/errors"
const (
	// ExtractorCH selects the ClickHouse SQL implementation of FilterExtractor.
	ExtractorCH = "qfe_ch"
	// ExtractorPromQL selects the PromQL implementation of FilterExtractor.
	ExtractorPromQL = "qfe_promql"
)
// ColumnInfo represents a column in the query
type ColumnInfo struct {
	// Name is the column's name as it appears in the query.
	Name string
	// Alias is the AS-alias, if any; empty when the column is not aliased.
	Alias string
	// OriginExpr is the expression the column value is derived from.
	OriginExpr string
	// OriginField is the underlying field the expression resolves to
	// (e.g. the JSON key for JSONExtract* expressions).
	OriginField string
}
// GroupName returns the field name in the resulting data which is used for
// grouping: the alias when one is set, otherwise the column name itself.
//
// Examples:
//
//	SELECT region as new_region FROM metrics WHERE metric_name='cpu' GROUP BY region
//	  -> "new_region"
//
//	SELECT region FROM metrics WHERE metric_name='cpu' GROUP BY region
//	  -> "region"
func (c *ColumnInfo) GroupName() string {
	if c.Alias == "" {
		return c.Name
	}
	return c.Alias
}
// FilterResult holds what a FilterExtractor pulled out of a query.
type FilterResult struct {
	// MetricNames are the metrics that are being filtered on
	MetricNames []string
	// GroupByColumns are the columns that are being grouped by
	GroupByColumns []ColumnInfo
}
// FilterExtractor extracts metric names and group-by columns from a query
// string; implementations exist for ClickHouse SQL and PromQL.
type FilterExtractor interface {
	// Extract parses query and returns the extracted filters, or an error
	// when the query cannot be parsed.
	Extract(query string) (*FilterResult, error)
}
// NewExtractor returns the FilterExtractor implementation for the given
// extractor type (ExtractorCH or ExtractorPromQL). Any other value yields an
// invalid-input error.
func NewExtractor(extractorType string) (FilterExtractor, error) {
	switch extractorType {
	case ExtractorCH:
		return NewClickHouseFilterExtractor(), nil
	case ExtractorPromQL:
		return NewPromQLFilterExtractor(), nil
	default:
		// Use the NewInvalidInputf helper for consistency with the rest of
		// the package (see PromQLFilterExtractor.Extract) instead of
		// spelling out type and code with errors.Newf.
		return nil, errors.NewInvalidInputf(errors.CodeInvalidInput, "invalid extractor type: %s", extractorType)
	}
}