fix: Fix CSV select(len()) off by 1 with comment prefix (#25069)

nameexhaustion · web-flow · commit 5a4c0d2554bd · 2025-11-03T14:19:03.000+01:00
diff --git a/crates/polars-io/src/csv/read/mod.rs b/crates/polars-io/src/csv/read/mod.rs
@@ -26,15 +26,15 @@ mod splitfields;
 mod utils;
 
 pub use options::{CommentPrefix, CsvEncoding, CsvParseOptions, CsvReadOptions, NullValues};
-pub use parser::{count_rows, count_rows_from_slice_par, count_rows_from_slice_raw};
+pub use parser::{count_rows, count_rows_from_slice_par};
 pub use read_impl::batched::{BatchedCsvReader, OwnedBatchedCsvReader};
 pub use reader::CsvReader;
 pub use schema_inference::infer_file_schema;
 
 pub mod _csv_read_internal {
     pub use super::buffer::validate_utf8;
     pub use super::options::NullValuesCompiled;
-    pub use super::parser::CountLines;
+    pub use super::parser::{CountLines, is_comment_line};
     pub use super::read_impl::{cast_columns, find_starting_point, read_chunk};
     pub use super::reader::prepare_csv_schema;
 }
diff --git a/crates/polars-io/src/csv/read/parser.rs b/crates/polars-io/src/csv/read/parser.rs
@@ -113,16 +113,17 @@ pub fn count_rows_from_slice_par(
                         // Ensure we start at the start of a line.
                         if let Some(nl_off) = bytes[start_offset..next_start_offset]
                             .iter()
-                            .position(|b| *b == b'\n')
+                            .position(|b| *b == eol_char)
                         {
                             start_offset += nl_off + 1;
                         } else {
                             return count.analyze_chunk(&[]);
                         }
                     }
 
-                    let stop_offset = if let Some(nl_off) =
-                        bytes[next_start_offset..].iter().position(|b| *b == b'\n')
+                    let stop_offset = if let Some(nl_off) = bytes[next_start_offset..]
+                        .iter()
+                        .position(|b| *b == eol_char)
                     {
                         next_start_offset + nl_off + 1
                     } else {
@@ -140,28 +141,21 @@ pub fn count_rows_from_slice_par(
             n += pair[in_string as usize].newline_count;
             in_string = pair[in_string as usize].end_inside_string;
         }
-        if let Some(last) = bytes.last() {
-            n += (*last != eol_char) as usize;
+        if let Some(last) = bytes.last()
+            && *last != eol_char
+            && (comment_prefix.is_none()
+                || !is_comment_line(
+                    bytes.rsplit(|c| *c == eol_char).next().unwrap(),
+                    comment_prefix,
+                ))
+        {
+            n += 1
         }
+
         Ok(n)
     })
 }
 
-/// Read the number of rows without parsing columns, assuming bytes is at a
-/// newline starting point. Does not deal with start/header.
-pub fn count_rows_from_slice_raw(
-    bytes: &[u8],
-    quote_char: Option<u8>,
-    comment_prefix: Option<&CommentPrefix>,
-    eol_char: u8,
-) -> PolarsResult<usize> {
-    Ok(
-        CountLines::new(quote_char, eol_char, comment_prefix.cloned())
-            .count(bytes)
-            .0,
-    )
-}
-
 /// Skip the utf-8 Byte Order Mark.
 /// credits to csv-core
 pub(super) fn skip_bom(input: &[u8]) -> &[u8] {
@@ -176,7 +170,7 @@ pub(super) fn skip_bom(input: &[u8]) -> &[u8] {
 ///
 /// This function is used during CSV parsing to determine whether a line should be ignored based on its starting characters.
 #[inline]
-pub(super) fn is_comment_line(line: &[u8], comment_prefix: Option<&CommentPrefix>) -> bool {
+pub fn is_comment_line(line: &[u8], comment_prefix: Option<&CommentPrefix>) -> bool {
     match comment_prefix {
         Some(CommentPrefix::Single(c)) => line.first() == Some(c),
         Some(CommentPrefix::Multi(s)) => line.starts_with(s.as_bytes()),
@@ -770,7 +764,10 @@ impl CountLines {
             // Skip comment line if needed.
             while bytes[scan_offset..].starts_with(pre_s) {
                 scan_offset += pre_s.len();
-                let Some(nl_off) = bytes[scan_offset..].iter().position(|c| *c == b'\n') else {
+                let Some(nl_off) = bytes[scan_offset..]
+                    .iter()
+                    .position(|c| *c == self.eol_char)
+                else {
                     break;
                 };
                 scan_offset += nl_off + 1;
@@ -799,13 +796,18 @@ impl CountLines {
         loop {
             let b = unsafe { bytes.get_unchecked(..(*chunk_size).min(bytes.len())) };
 
-            let (count, offset) = self.count(b);
+            let (count, offset) = if self.comment_prefix.is_some() {
+                let stats = self.analyze_chunk_with_comment(b, false);
+                (stats.newline_count, stats.last_newline_offset)
+            } else {
+                self.count(b)
+            };
 
             if count > 0 || b.len() == bytes.len() {
                 return (count, offset);
             }
 
-            *chunk_size *= 2;
+            *chunk_size = chunk_size.saturating_mul(2);
         }
     }
 
diff --git a/crates/polars-io/src/csv/read/read_impl.rs b/crates/polars-io/src/csv/read/read_impl.rs
@@ -407,13 +407,13 @@ impl<'a> CoreReader<'a> {
                         std::ptr::eq(b.as_ptr().add(b.len()), bytes.as_ptr().add(bytes.len()))
                     } {
                     total_offset = bytes.len();
-                    (b, 1)
+                    let c = if is_comment_line(bytes, self.parse_options.comment_prefix.as_ref()) {
+                        0
+                    } else {
+                        1
+                    };
+                    (b, c)
                 } else {
-                    if count == 0 {
-                        chunk_size *= 2;
-                        continue;
-                    }
-
                     let end = total_offset + position + 1;
                     let b = unsafe { bytes.get_unchecked(total_offset..end) };
 
@@ -440,12 +440,21 @@ impl<'a> CoreReader<'a> {
                         let result = slf
                             .read_chunk(b, projection, 0, count, Some(0), b.len())
                             .and_then(|mut df| {
-
                                 // Check malformed
-                                if df.height() > count || (df.height() < count && slf.parse_options.comment_prefix.is_none()) {
+                                if df.height() > count
+                                    || (df.height() < count
+                                        && slf.parse_options.comment_prefix.is_none())
+                                {
                                     // Note: in case data is malformed, df.height() is more likely to be correct than count.
-                                    let msg = format!("CSV malformed: expected {} rows, actual {} rows, in chunk starting at byte offset {}, length {}",
-                                        count, df.height(), previous_total_offset, b.len());
+                                    let msg = format!(
+                                        "CSV malformed: expected {} rows, \
+                                        actual {} rows, in chunk starting at \
+                                        byte offset {}, length {}",
+                                        count,
+                                        df.height(),
+                                        previous_total_offset,
+                                        b.len()
+                                    );
                                     if slf.ignore_errors {
                                         polars_warn!(msg);
                                     } else {
@@ -482,9 +491,7 @@ impl<'a> CoreReader<'a> {
 
                     // Check just after we spawned a chunk. That mean we processed all data up until
                     // row count.
-                    if self.n_rows.is_some()
-                        && total_line_count.load() > self.n_rows.unwrap()
-                    {
+                    if self.n_rows.is_some() && total_line_count.load() > self.n_rows.unwrap() {
                         break;
                     }
                 }
diff --git a/crates/polars-stream/src/nodes/io_sources/csv.rs b/crates/polars-stream/src/nodes/io_sources/csv.rs
@@ -8,13 +8,11 @@ use polars_error::{PolarsResult, polars_bail, polars_err, polars_warn};
 use polars_io::RowIndex;
 use polars_io::cloud::CloudOptions;
 use polars_io::prelude::_csv_read_internal::{
-    CountLines, NullValuesCompiled, cast_columns, find_starting_point, prepare_csv_schema,
-    read_chunk,
+    CountLines, NullValuesCompiled, cast_columns, find_starting_point, is_comment_line,
+    prepare_csv_schema, read_chunk,
 };
 use polars_io::prelude::buffer::validate_utf8;
-use polars_io::prelude::{
-    CommentPrefix, CsvEncoding, CsvParseOptions, CsvReadOptions, count_rows_from_slice_raw,
-};
+use polars_io::prelude::{CsvEncoding, CsvParseOptions, CsvReadOptions};
 use polars_io::utils::compression::maybe_decompress_bytes;
 use polars_io::utils::slice::SplitSlicePosition;
 use polars_plan::dsl::ScanSource;
@@ -251,15 +249,11 @@ impl FileReader for CsvFileReader {
             )
         }
 
-        // Only used on empty projection, or if we need the exact row count.
-        let alt_count_lines: Option<Arc<CountLinesWithComments>> =
-            CountLinesWithComments::opt_new(&self.options.parse_options).map(Arc::new);
         let chunk_reader = Arc::new(ChunkReader::try_new(
             self.options.clone(),
             inferred_schema.clone(),
             projection,
             row_index,
-            alt_count_lines.clone(),
         )?);
 
         let needs_full_row_count = n_rows_in_file_tx.is_some();
@@ -304,7 +298,6 @@ impl FileReader for CsvFileReader {
                 let chunk_reader = chunk_reader.clone();
                 // Note: We don't use this (it is handled by the bridge). But morsels require a source token.
                 let source_token = SourceToken::new();
-                let alt_count_lines = alt_count_lines.clone();
 
                 AbortOnDropHandle::new(spawn(TaskPriority::Low, async move {
                     while let Ok(LineBatch {
@@ -352,7 +345,7 @@ impl FileReader for CsvFileReader {
                         }
 
                         while let Ok(LineBatch {
-                            bytes,
+                            bytes: _,
                             n_lines,
                             slice,
                             row_offset: _,
@@ -361,12 +354,6 @@ impl FileReader for CsvFileReader {
                         {
                             assert_eq!(slice, SLICE_ENDED);
 
-                            let n_lines = if let Some(v) = alt_count_lines.as_deref() {
-                                v.count_lines(bytes)?
-                            } else {
-                                n_lines
-                            };
-
                             n_rows_processed = n_rows_processed.saturating_add(n_lines);
                         }
                     }
@@ -473,17 +460,18 @@ impl LineBatchSource {
 
         let global_bytes: &[u8] = memslice.as_ref();
         let global_bytes: &'static [u8] = unsafe { std::mem::transmute(global_bytes) };
+        let comment_prefix = options.parse_options.comment_prefix.as_ref();
 
-        let i = {
-            let parse_options = options.parse_options.as_ref();
+        let parse_options = options.parse_options.as_ref();
+        let eol_char = parse_options.eol_char;
 
+        let i = {
             let quote_char = parse_options.quote_char;
-            let eol_char = parse_options.eol_char;
 
             let skip_lines = options.skip_lines;
             let skip_rows_before_header = options.skip_rows;
             let skip_rows_after_header = options.skip_rows_after_header;
-            let comment_prefix = parse_options.comment_prefix.clone();
+            let comment_prefix = comment_prefix.cloned();
             let has_header = options.has_header;
 
             find_starting_point(
@@ -524,7 +512,16 @@ impl LineBatchSource {
 
             let (count, position) = line_counter.find_next(bytes, &mut chunk_size);
             let (count, position) = if count == 0 {
-                (1, bytes.len())
+                let c = if *bytes.last().unwrap() != eol_char
+                    && !is_comment_line(
+                        bytes.rsplit(|c| *c == eol_char).next().unwrap(),
+                        comment_prefix,
+                    ) {
+                    1
+                } else {
+                    0
+                };
+                (c, bytes.len())
             } else {
                 let pos = (position + 1).min(bytes.len()); // +1 for '\n'
                 (count, pos)
@@ -596,8 +593,6 @@ struct ChunkReader {
     null_values: Option<NullValuesCompiled>,
     validate_utf8: bool,
     row_index: Option<RowIndex>,
-    // Alternate line counter when there are comments. This is used on empty projection.
-    alt_count_lines: Option<Arc<CountLinesWithComments>>,
 }
 
 impl ChunkReader {
@@ -606,7 +601,6 @@ impl ChunkReader {
         mut reader_schema: SchemaRef,
         projection: Vec<usize>,
         row_index: Option<RowIndex>,
-        alt_count_lines: Option<Arc<CountLinesWithComments>>,
     ) -> PolarsResult<Self> {
         let mut fields_to_cast: Vec<Field> = options.fields_to_cast.clone();
         prepare_csv_schema(&mut reader_schema, &mut fields_to_cast)?;
@@ -633,7 +627,6 @@ impl ChunkReader {
             null_values,
             validate_utf8,
             row_index,
-            alt_count_lines,
         })
     }
 
@@ -652,13 +645,7 @@ impl ChunkReader {
 
         // If projection is empty create a DataFrame with the correct height by counting the lines.
         let mut df = if self.projection.is_empty() {
-            let h = if let Some(v) = &self.alt_count_lines {
-                v.count_lines(chunk)?
-            } else {
-                n_lines
-            };
-
-            DataFrame::empty_with_height(h)
+            DataFrame::empty_with_height(n_lines)
         } else {
             read_chunk(
                 chunk,
@@ -679,9 +666,7 @@ impl ChunkReader {
         let n_lines_is_correct = df.height() == n_lines;
 
         // Check malformed
-        if df.height() > n_lines
-            || (df.height() < n_lines && self.parse_options.comment_prefix.is_none())
-        {
+        if !n_lines_is_correct {
             // Note: in case data is malformed, df.height() is more likely to be correct than n_lines.
             let msg = format!(
                 "CSV malformed: expected {} rows, actual {} rows, in chunk starting at row_offset {}, length {}",
@@ -722,31 +707,3 @@ impl ChunkReader {
         Ok((df, height))
     }
 }
-
-struct CountLinesWithComments {
-    quote_char: Option<u8>,
-    eol_char: u8,
-    comment_prefix: CommentPrefix,
-}
-
-impl CountLinesWithComments {
-    fn opt_new(parse_options: &CsvParseOptions) -> Option<Self> {
-        parse_options
-            .comment_prefix
-            .clone()
-            .map(|comment_prefix| CountLinesWithComments {
-                quote_char: parse_options.quote_char,
-                eol_char: parse_options.eol_char,
-                comment_prefix,
-            })
-    }
-
-    fn count_lines(&self, bytes: &[u8]) -> PolarsResult<usize> {
-        count_rows_from_slice_raw(
-            bytes,
-            self.quote_char,
-            Some(&self.comment_prefix),
-            self.eol_char,
-        )
-    }
-}
diff --git a/py-polars/tests/unit/io/test_lazy_count_star.py b/py-polars/tests/unit/io/test_lazy_count_star.py