from Hacker News

So, How Fast Is Rust Anyway

by naveed125 on 3/18/25, 4:32 AM with 4 comments

  • by jmillikin on 3/18/25, 6:04 AM

    I flagged this for being LLM-generated garbage; original comment below. Any readers interested in benchmarking programming language implementations should visit https://benchmarksgame-team.pages.debian.net/benchmarksgame/... instead.

    ---

    The numbers in the table for C vs Rust don't make sense, and I wasn't able to reproduce them locally. For a benchmark like this I would expect to see nearly identical performance for those two languages.

    Benchmark sources:

    https://github.com/naveed125/rust-vs/blob/6db90fec706c875300...

    https://github.com/naveed125/rust-vs/blob/6db90fec706c875300...

    Benchmark process and results:

      $ gcc --version
      gcc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0
      $ gcc -O2 -static -o bench-c-gcc benchmark.c
      $ clang --version
      Ubuntu clang version 14.0.0-1ubuntu1.1
      $ clang -O2 -static -o bench-c-clang benchmark.c
      $ rustc --version
      rustc 1.81.0 (eeb90cda1 2024-09-04)
      $ rustc -C opt-level=2 --target x86_64-unknown-linux-musl -o bench-rs benchmark.rs
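      # musl target => statically linked binary (presumably to match -static on the C builds)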
    
      $ taskset -c 1 hyperfine --warmup 1000 ./bench-c-gcc
      Benchmark 1: ./bench-c-gcc
        Time (mean ± σ):       3.2 ms ±   0.1 ms    [User: 2.7 ms, System: 0.6 ms]
        Range (min … max):     3.2 ms …   4.1 ms    770 runs
    
      $ taskset -c 1 hyperfine --warmup 1000 ./bench-c-clang
      Benchmark 1: ./bench-c-clang
        Time (mean ± σ):       3.5 ms ±   0.1 ms    [User: 3.0 ms, System: 0.6 ms]
        Range (min … max):     3.4 ms …   4.8 ms    721 runs
    
      $ taskset -c 1 hyperfine --warmup 1000 ./bench-rs
      Benchmark 1: ./bench-rs
        Time (mean ± σ):       5.1 ms ±   0.1 ms    [User: 2.9 ms, System: 2.2 ms]
        Range (min … max):     5.0 ms …   7.1 ms    507 runs
    
    
    Those numbers also don't make sense, but in a different way. Why is the Rust version so much slower, and why does it spend the majority of its time in "system"?
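
    One quick way to check where that system time goes (not part of the runs above) is a syscall summary; for an allocation-heavy binary you'd expect memory-management calls like mmap and brk to dominate the counts:

      $ strace -c ./bench-rs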

    Oh, it's because benchmark.rs is performing a dynamic memory allocation for each key. The C version uses a buffer on the stack, with fixed-width keys. Let's try doing the same in the Rust version:

      --- benchmark.rs
      +++ benchmark.rs
      @@ -38,22 +38,22 @@
       }
     
       // Generates a random 8-character string
      -fn generate_random_string(rng: &mut Xorshift) -> String {
      +fn generate_random_string(rng: &mut Xorshift) -> [u8; 8] {
           const CHARSET: &[u8] = b"0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ";
      -    let mut result = String::with_capacity(8);
      +    let mut result = [0u8; 8];
       
      -    for _ in 0..8 {
      +    for ii in 0..8 {
               let rand_index = (rng.next() % 62) as usize;
      -        result.push(CHARSET[rand_index] as char);
      +        result[ii] = CHARSET[rand_index];
           }
       
           result
       }
       
       // Generates `count` random strings and tracks their occurrences
      -fn generate_random_strings(count: usize) -> HashMap<String, u32> {
      +fn generate_random_strings(count: usize) -> HashMap<[u8; 8], u32> {
           let mut rng = Xorshift::new();
      -    let mut string_counts: HashMap<String, u32> = HashMap::new();
      +    let mut string_counts: HashMap<[u8; 8], u32> = HashMap::with_capacity(count);
       
           for _ in 0..count {
               let random_string = generate_random_string(&mut rng);
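
    The Xorshift type used above isn't shown in this excerpt; judging from the calls (Xorshift::new(), rng.next()) it's presumably something along these lines, with the same shift constants and clock seeding as the C version:

      // Hypothetical reconstruction; the real definition is in the
      // linked benchmark.rs.
      struct Xorshift {
          state0: u64,
          state1: u64,
      }

      impl Xorshift {
          fn new() -> Xorshift {
              // Clock-based seeding, mirroring the C version's.
              let ms = std::time::SystemTime::now()
                  .duration_since(std::time::UNIX_EPOCH)
                  .unwrap()
                  .as_millis() as u64;
              Xorshift {
                  state0: ms ^ 0xDEADBEEF,
                  state1: (ms << 21) ^ 0x95419C24A637B12F,
              }
          }

          // One xorshift+ step; the state lives behind &mut self, so after
          // inlining it can sit in registers rather than in globals.
          fn next(&mut self) -> u64 {
              let mut s1 = self.state0;
              let s0 = self.state1;
              self.state0 = s0;
              s1 ^= s1 << 23;
              s1 ^= s1 >> 18;
              s1 ^= s0;
              s1 ^= s0 >> 5;
              self.state1 = s1;
              s1.wrapping_add(s0)
          }
      }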
    
    Now it's spending all its time in userspace again, which is good:

      $ taskset -c 1 hyperfine --warmup 1000 ./bench-rs
      Benchmark 1: ./bench-rs
        Time (mean ± σ):       1.5 ms ±   0.1 ms    [User: 1.3 ms, System: 0.2 ms]
        Range (min … max):     1.4 ms …   3.2 ms    1426 runs
     
    ... but why is it twice as fast as the C version?
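
    A quick way to dig into that (again, not part of the runs above) would be comparing instruction and cycle counts for the two binaries:

      $ taskset -c 1 perf stat -e instructions,cycles ./bench-c-gcc
      $ taskset -c 1 perf stat -e instructions,cycles ./bench-rs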

    ---

    So I go looking in benchmark.c, and my eyes are immediately drawn to this weird bullshit:

      // Xorshift+ state variables (64-bit)
      uint64_t state0, state1;
    
      // Xorshift+ function for generating pseudo-random 64-bit numbers
      uint64_t xorshift_plus() {
          uint64_t s1 = state0;
          uint64_t s0 = state1;
          state0 = s0; 
          s1 ^= s1 << 23; 
          s1 ^= s1 >> 18; 
          s1 ^= s0; 
          s1 ^= s0 >> 5;
          state1 = s1; 
          return state1 + s0; 
      }
    
    That's not simply a copy of the xorshift+ example code on Wikipedia. Is there any human in the world who is capable of writing xorshift+ but is also dumb enough to put its state into global variables? I smell an LLM.

    A rough patch to put the state into something the compiler has a hope of optimizing; with globals, every call is forced to load and store the state through memory, since the compiler can't prove nothing else touches it:

      --- benchmark.c
      +++ benchmark.c
      @@ -18,25 +18,35 @@
       StringNode *hashTable[HASH_TABLE_SIZE]; // Hash table for storing unique strings
       
       // Xorshift+ state variables (64-bit)
      -uint64_t state0, state1;
      +struct xorshift_state {
      +       uint64_t state0, state1;
      +};
       
       // Xorshift+ function for generating pseudo-random 64-bit numbers
      -uint64_t xorshift_plus() {
      -    uint64_t s1 = state0;
      -    uint64_t s0 = state1;
      -    state0 = s0;
      +uint64_t xorshift_plus(struct xorshift_state *st) {
      +    uint64_t s1 = st->state0;
      +    uint64_t s0 = st->state1;
      +    st->state0 = s0;
           s1 ^= s1 << 23;
           s1 ^= s1 >> 18;
           s1 ^= s0;
           s1 ^= s0 >> 5;
      -    state1 = s1;
      -    return state1 + s0;
      +    st->state1 = s1;
      +    return s1 + s0;
       }
       
       // Function to generate an 8-character random string
       void generate_random_string(char *buffer) {
      +    uint64_t timestamp = (uint64_t)time(NULL) * 1000;
      +    uint64_t state0 = timestamp ^ 0xDEADBEEF;
      +    uint64_t state1 = (timestamp << 21) ^ 0x95419C24A637B12F;
      +    struct xorshift_state st = {
      +        .state0 = state0,
      +        .state1 = state1,
      +    };
      +
           for (int i = 0; i < STRING_LENGTH; i++) {
      -        uint64_t rand_value = xorshift_plus() % 62;
      +        uint64_t rand_value = xorshift_plus(&st) % 62;
       
               if (rand_value < 10) { // 0-9
                   buffer[i] = '0' + rand_value;
      @@ -113,11 +123,6 @@
       }
       
       int main() {
      -    // Initialize random seed
      -    uint64_t timestamp = (uint64_t)time(NULL) * 1000;
      -    state0 = timestamp ^ 0xDEADBEEF; // Arbitrary constant
      -    state1 = (timestamp << 21) ^ 0x95419C24A637B12F; // Arbitrary constant
      -
           double total_time = 0.0;
       
           // Run 3 times and measure execution time
      
    
    and the benchmarks now make slightly more sense:

      $ taskset -c 1 hyperfine --warmup 1000 ./bench-c-gcc
      Benchmark 1: ./bench-c-gcc
        Time (mean ± σ):       1.1 ms ±   0.1 ms    [User: 1.1 ms, System: 0.1 ms]
        Range (min … max):     1.0 ms …   1.8 ms    1725 runs
      
      $ taskset -c 1 hyperfine --warmup 1000 ./bench-c-clang
      Benchmark 1: ./bench-c-clang
        Time (mean ± σ):       1.0 ms ±   0.1 ms    [User: 0.9 ms, System: 0.1 ms]
        Range (min … max):     0.9 ms …   1.4 ms    1863 runs
    
    But I'm going to stop trying to improve this garbage, because on re-reading the article, I saw this:

      > Yes, I absolutely used ChatGPT to polish my code. If you’re judging me for this,
      > I’m going to assume you still churn butter by hand and refuse to use calculators.
      > [...]
      > I then embarked on the linguistic equivalent of “Google Translate for code,”
    
    OK, so it's LLM-generated bullshit, translated into other languages either by another LLM or by a human who doesn't know those languages well enough to notice when the output doesn't make any sense.