@@ -8,13 +8,11 @@ use polars_error::{PolarsResult, polars_bail, polars_err, polars_warn};
88use polars_io:: RowIndex ;
99use polars_io:: cloud:: CloudOptions ;
1010use polars_io:: prelude:: _csv_read_internal:: {
11- CountLines , NullValuesCompiled , cast_columns, find_starting_point, prepare_csv_schema ,
12- read_chunk,
11+ CountLines , NullValuesCompiled , cast_columns, find_starting_point, is_comment_line ,
12+ prepare_csv_schema , read_chunk,
1313} ;
1414use polars_io:: prelude:: buffer:: validate_utf8;
15- use polars_io:: prelude:: {
16- CommentPrefix , CsvEncoding , CsvParseOptions , CsvReadOptions , count_rows_from_slice_raw,
17- } ;
15+ use polars_io:: prelude:: { CsvEncoding , CsvParseOptions , CsvReadOptions } ;
1816use polars_io:: utils:: compression:: maybe_decompress_bytes;
1917use polars_io:: utils:: slice:: SplitSlicePosition ;
2018use polars_plan:: dsl:: ScanSource ;
@@ -251,15 +249,11 @@ impl FileReader for CsvFileReader {
251249 )
252250 }
253251
254- // Only used on empty projection, or if we need the exact row count.
255- let alt_count_lines: Option < Arc < CountLinesWithComments > > =
256- CountLinesWithComments :: opt_new ( & self . options . parse_options ) . map ( Arc :: new) ;
257252 let chunk_reader = Arc :: new ( ChunkReader :: try_new (
258253 self . options . clone ( ) ,
259254 inferred_schema. clone ( ) ,
260255 projection,
261256 row_index,
262- alt_count_lines. clone ( ) ,
263257 ) ?) ;
264258
265259 let needs_full_row_count = n_rows_in_file_tx. is_some ( ) ;
@@ -304,7 +298,6 @@ impl FileReader for CsvFileReader {
304298 let chunk_reader = chunk_reader. clone ( ) ;
305299 // Note: We don't use this (it is handled by the bridge). But morsels require a source token.
306300 let source_token = SourceToken :: new ( ) ;
307- let alt_count_lines = alt_count_lines. clone ( ) ;
308301
309302 AbortOnDropHandle :: new ( spawn ( TaskPriority :: Low , async move {
310303 while let Ok ( LineBatch {
@@ -352,7 +345,7 @@ impl FileReader for CsvFileReader {
352345 }
353346
354347 while let Ok ( LineBatch {
355- bytes,
348+ bytes : _ ,
356349 n_lines,
357350 slice,
358351 row_offset : _,
@@ -361,12 +354,6 @@ impl FileReader for CsvFileReader {
361354 {
362355 assert_eq ! ( slice, SLICE_ENDED ) ;
363356
364- let n_lines = if let Some ( v) = alt_count_lines. as_deref ( ) {
365- v. count_lines ( bytes) ?
366- } else {
367- n_lines
368- } ;
369-
370357 n_rows_processed = n_rows_processed. saturating_add ( n_lines) ;
371358 }
372359 }
@@ -473,17 +460,18 @@ impl LineBatchSource {
473460
474461 let global_bytes: & [ u8 ] = memslice. as_ref ( ) ;
475462 let global_bytes: & ' static [ u8 ] = unsafe { std:: mem:: transmute ( global_bytes) } ;
463+ let comment_prefix = options. parse_options . comment_prefix . as_ref ( ) ;
476464
477- let i = {
478- let parse_options = options . parse_options . as_ref ( ) ;
465+ let parse_options = options . parse_options . as_ref ( ) ;
466+ let eol_char = parse_options. eol_char ;
479467
468+ let i = {
480469 let quote_char = parse_options. quote_char ;
481- let eol_char = parse_options. eol_char ;
482470
483471 let skip_lines = options. skip_lines ;
484472 let skip_rows_before_header = options. skip_rows ;
485473 let skip_rows_after_header = options. skip_rows_after_header ;
486- let comment_prefix = parse_options . comment_prefix . clone ( ) ;
474+ let comment_prefix = comment_prefix. cloned ( ) ;
487475 let has_header = options. has_header ;
488476
489477 find_starting_point (
@@ -524,7 +512,16 @@ impl LineBatchSource {
524512
525513 let ( count, position) = line_counter. find_next ( bytes, & mut chunk_size) ;
526514 let ( count, position) = if count == 0 {
527- ( 1 , bytes. len ( ) )
515+ let c = if * bytes. last ( ) . unwrap ( ) != eol_char
516+ && !is_comment_line (
517+ bytes. rsplit ( |c| * c == eol_char) . next ( ) . unwrap ( ) ,
518+ comment_prefix,
519+ ) {
520+ 1
521+ } else {
522+ 0
523+ } ;
524+ ( c, bytes. len ( ) )
528525 } else {
529526 let pos = ( position + 1 ) . min ( bytes. len ( ) ) ; // +1 for '\n'
530527 ( count, pos)
@@ -596,8 +593,6 @@ struct ChunkReader {
596593 null_values : Option < NullValuesCompiled > ,
597594 validate_utf8 : bool ,
598595 row_index : Option < RowIndex > ,
599- // Alternate line counter when there are comments. This is used on empty projection.
600- alt_count_lines : Option < Arc < CountLinesWithComments > > ,
601596}
602597
603598impl ChunkReader {
@@ -606,7 +601,6 @@ impl ChunkReader {
606601 mut reader_schema : SchemaRef ,
607602 projection : Vec < usize > ,
608603 row_index : Option < RowIndex > ,
609- alt_count_lines : Option < Arc < CountLinesWithComments > > ,
610604 ) -> PolarsResult < Self > {
611605 let mut fields_to_cast: Vec < Field > = options. fields_to_cast . clone ( ) ;
612606 prepare_csv_schema ( & mut reader_schema, & mut fields_to_cast) ?;
@@ -633,7 +627,6 @@ impl ChunkReader {
633627 null_values,
634628 validate_utf8,
635629 row_index,
636- alt_count_lines,
637630 } )
638631 }
639632
@@ -652,13 +645,7 @@ impl ChunkReader {
652645
653646 // If projection is empty create a DataFrame with the correct height by counting the lines.
654647 let mut df = if self . projection . is_empty ( ) {
655- let h = if let Some ( v) = & self . alt_count_lines {
656- v. count_lines ( chunk) ?
657- } else {
658- n_lines
659- } ;
660-
661- DataFrame :: empty_with_height ( h)
648+ DataFrame :: empty_with_height ( n_lines)
662649 } else {
663650 read_chunk (
664651 chunk,
@@ -679,9 +666,7 @@ impl ChunkReader {
679666 let n_lines_is_correct = df. height ( ) == n_lines;
680667
681668 // Check malformed
682- if df. height ( ) > n_lines
683- || ( df. height ( ) < n_lines && self . parse_options . comment_prefix . is_none ( ) )
684- {
669+ if !n_lines_is_correct {
685670 // Note: in case data is malformed, df.height() is more likely to be correct than n_lines.
686671 let msg = format ! (
687672 "CSV malformed: expected {} rows, actual {} rows, in chunk starting at row_offset {}, length {}" ,
@@ -722,31 +707,3 @@ impl ChunkReader {
722707 Ok ( ( df, height) )
723708 }
724709}
725-
726- struct CountLinesWithComments {
727- quote_char : Option < u8 > ,
728- eol_char : u8 ,
729- comment_prefix : CommentPrefix ,
730- }
731-
732- impl CountLinesWithComments {
733- fn opt_new ( parse_options : & CsvParseOptions ) -> Option < Self > {
734- parse_options
735- . comment_prefix
736- . clone ( )
737- . map ( |comment_prefix| CountLinesWithComments {
738- quote_char : parse_options. quote_char ,
739- eol_char : parse_options. eol_char ,
740- comment_prefix,
741- } )
742- }
743-
744- fn count_lines ( & self , bytes : & [ u8 ] ) -> PolarsResult < usize > {
745- count_rows_from_slice_raw (
746- bytes,
747- self . quote_char ,
748- Some ( & self . comment_prefix ) ,
749- self . eol_char ,
750- )
751- }
752- }
0 commit comments