From 55dc32d06951b2df5b1dced8afbe9a9d5b0a65b1 Mon Sep 17 00:00:00 2001
From: Sofia Soares <sofia.soares@tecnico.ulisboa.pt>
Date: Thu, 3 Apr 2025 14:59:37 +0100
Subject: [PATCH 1/3] BUG: Fix #57608: queries on categorical string columns in
 HDFStore.select() return unexpected results. In function __init__() of class
 Selection (pandas/core/io/pytables.py), the method self.terms.evaluate() was
 not returning the correct value for the where condition. The issue stemmed
 from the function convert_value() of class BinOp
 (pandas/core/computation/pytables.py), where the function searchsorted()
 did not return the correct index when matching the where condition in the
 metadata (categories table). Replacing searchsorted() with np.where()
 resolves this issue.

---
 doc/source/whatsnew/v3.0.0.rst         |  1 +
 pandas/core/computation/pytables.py    |  3 ++-
 pandas/tests/io/pytables/test_store.py | 23 +++++++++++++++++++++++
 3 files changed, 26 insertions(+), 1 deletion(-)

diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
index e6fafc8b1b14c..9cd79dc58d9d2 100644
--- a/doc/source/whatsnew/v3.0.0.rst
+++ b/doc/source/whatsnew/v3.0.0.rst
@@ -736,6 +736,7 @@ I/O
 - Bug in :meth:`DataFrame.to_stata` when writing more than 32,000 value labels. (:issue:`60107`)
 - Bug in :meth:`DataFrame.to_string` that raised ``StopIteration`` with nested DataFrames. (:issue:`16098`)
 - Bug in :meth:`HDFStore.get` was failing to save data of dtype datetime64[s] correctly (:issue:`59004`)
+- Bug in :meth:`HDFStore.select` causing queries on categorical string columns to return unexpected results (:issue:`57608`)
 - Bug in :meth:`read_csv` causing segmentation fault when ``encoding_errors`` is not a string. (:issue:`59059`)
 - Bug in :meth:`read_csv` raising ``TypeError`` when ``index_col`` is specified and ``na_values`` is a dict containing the key ``None``. (:issue:`57547`)
 - Bug in :meth:`read_csv` raising ``TypeError`` when ``nrows`` and ``iterator`` are specified without specifying a ``chunksize``. (:issue:`59079`)
diff --git a/pandas/core/computation/pytables.py b/pandas/core/computation/pytables.py
index 166c9d47294cd..ae895c0424014 100644
--- a/pandas/core/computation/pytables.py
+++ b/pandas/core/computation/pytables.py
@@ -239,7 +239,8 @@ def stringify(value):
             if conv_val not in metadata:
                 result = -1
             else:
-                result = metadata.searchsorted(conv_val, side="left")
+                # Find the index of the first match of conv_val in metadata
+                result = np.where(metadata == conv_val)[0][0]
             return TermValue(result, result, "integer")
         elif kind == "integer":
             try:
diff --git a/pandas/tests/io/pytables/test_store.py b/pandas/tests/io/pytables/test_store.py
index bb2058c050f2a..5c32f2c8a4d8d 100644
--- a/pandas/tests/io/pytables/test_store.py
+++ b/pandas/tests/io/pytables/test_store.py
@@ -25,6 +25,9 @@
     timedelta_range,
 )
 import pandas._testing as tm
+from pandas.api.types import (
+    CategoricalDtype,
+)
 from pandas.conftest import has_pyarrow
 from pandas.tests.io.pytables.common import (
     _maybe_remove,
@@ -1106,3 +1109,23 @@ def test_store_bool_index(tmp_path, setup_path):
     df.to_hdf(path, key="a")
     result = read_hdf(path, "a")
     tm.assert_frame_equal(expected, result)
+
+
+@pytest.mark.parametrize("model", ["name", "longname", "verylongname"])
+def test_select_categorical_string_columns(tmp_path, model):
+    # GH#57608: selecting on a categorical string data column must return
+    # the same rows as the equivalent in-memory boolean filter on the frame.
+    path = tmp_path / "test.h5"
+
+    models = CategoricalDtype(categories=["name", "longname", "verylongname"])
+    df = DataFrame(
+        {"modelId": ["name", "longname", "longname"], "value": [1, 2, 3]}
+    ).astype({"modelId": models, "value": int})
+
+    with HDFStore(path, "w") as store:
+        store.append("df", df, data_columns=["modelId"])
+
+    with HDFStore(path, "r") as store:
+        result = store.select("df", "modelId == model")
+        expected = df[df["modelId"] == model]
+        tm.assert_frame_equal(result, expected)

From 386b56645d36848b076123f41d42af3dfceb5be4 Mon Sep 17 00:00:00 2001
From: Sofia Soares <sofia.soares@tecnico.ulisboa.pt>
Date: Fri, 4 Apr 2025 14:39:13 +0100
Subject: [PATCH 2/3] BUG: Follow-up for #57608: check if metadata is sorted
 before search

---
 pandas/core/computation/pytables.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/pandas/core/computation/pytables.py b/pandas/core/computation/pytables.py
index ae895c0424014..99aa8c6ebcc2d 100644
--- a/pandas/core/computation/pytables.py
+++ b/pandas/core/computation/pytables.py
@@ -239,8 +239,13 @@ def stringify(value):
             if conv_val not in metadata:
                 result = -1
             else:
-                # Find the index of the first match of conv_val in metadata
-                result = np.where(metadata == conv_val)[0][0]
+                # Check if metadata is sorted
+                if np.all(metadata[:-1] <= metadata[1:]):
+                    # If it is, use searchsorted for efficient lookup
+                    result = metadata.searchsorted(conv_val, side="left")
+                else:
+                    # Find the index of the first match of conv_val in metadata
+                    result = np.flatnonzero(metadata == conv_val)[0]
             return TermValue(result, result, "integer")
         elif kind == "integer":
             try:

From 6243e26581e3a622c0cc8ef1ed9dd75da4c1f16c Mon Sep 17 00:00:00 2001
From: Sofia Soares <sofia.soares@tecnico.ulisboa.pt>
Date: Tue, 8 Apr 2025 13:17:33 +0100
Subject: [PATCH 3/3] BUG: Follow-up for #57608: use direct match via
 np.flatnonzero

---
 pandas/core/computation/pytables.py | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/pandas/core/computation/pytables.py b/pandas/core/computation/pytables.py
index 99aa8c6ebcc2d..77b7d9ad11a6c 100644
--- a/pandas/core/computation/pytables.py
+++ b/pandas/core/computation/pytables.py
@@ -239,13 +239,8 @@ def stringify(value):
             if conv_val not in metadata:
                 result = -1
             else:
-                # Check if metadata is sorted
-                if np.all(metadata[:-1] <= metadata[1:]):
-                    # If it is, use searchsorted for efficient lookup
-                    result = metadata.searchsorted(conv_val, side="left")
-                else:
-                    # Find the index of the first match of conv_val in metadata
-                    result = np.flatnonzero(metadata == conv_val)[0]
+                # Find the index of the first match of conv_val in metadata
+                result = np.flatnonzero(metadata == conv_val)[0]
             return TermValue(result, result, "integer")
         elif kind == "integer":
             try: