多线程视角

全局变量 + Heap -> Global变量

Stack -> 私有变量

Stack 是每个线程私有的，Heap 和全局变量由所有线程共享。-> 状态机的每一步，相当于选定一个线程，在「全局状态 + 该线程的私有状态（局部状态）」上执行一步，并得到新的状态。

由于并发程序的执行顺序具有不确定性（每一步都可能由任意一个线程执行），状态机从一条链表变成了一棵多叉树，复杂度飙升！！！

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <stdatomic.h>
#include <assert.h>
#include <unistd.h>
#include <pthread.h>

/* Capacity of the static thread pool. */
#define NTHREAD 64

/* Life-cycle state of a pool slot. */
enum {
    T_FREE = 0,  /* value of a zero-initialised, never-used slot */
    T_LIVE,      /* assigned by create(); join() waits on these */
    T_DEAD,      /* assigned by join() after pthread_join */
};

/* Bookkeeping for one thread started through create(). */
struct thread {
    int id;              /* 1-based id, passed to entry */
    int status;          /* one of T_FREE / T_LIVE / T_DEAD */
    pthread_t thread;    /* handle filled in by pthread_create */
    void (*entry)(int);  /* user function invoked by wrapper() */
};

/* Slot storage, plus a cursor pointing at the next unused slot. */
struct thread tpool[NTHREAD];
struct thread *tptr = tpool;

/*
 * Start routine passed to pthread_create.
 * arg points at the pool slot of the new thread; the slot's entry
 * function is called with the slot's id. Always returns NULL.
 */
void *wrapper(void *arg) {
    struct thread *self = arg;
    void (*body)(int) = self->entry;

    body(self->id);
    return NULL;
}

/*
 * Start a new thread that runs fn and record it in the next free pool slot.
 *
 * fn is stored in a void (*)(int) field and is called as fn(id), where id
 * is the 1-based index of the slot. It is accepted as void * so callers can
 * pass a function name directly.
 *
 * Fails an assertion if all NTHREAD slots are in use. If pthread_create
 * itself fails, the error is reported on stderr and the process aborts:
 * otherwise the slot would stay T_LIVE with an unset handle and join()
 * would later call pthread_join on it.
 */
void create(void *fn) {
    assert(tptr - tpool < NTHREAD);
    *tptr = (struct thread) {
            .id = (int)(tptr - tpool) + 1,
            .status = T_LIVE,
            .entry = fn,
    };

    /* pthread_create returns an error number (not -1/errno) on failure. */
    int rc = pthread_create(&(tptr->thread), NULL, wrapper, tptr);
    if (rc != 0) {
        fprintf(stderr, "create: pthread_create failed: %s\n", strerror(rc));
        abort();
    }
    ++tptr;
}

void join() {
    for (int i = 0; i < NTHREAD; i++) {
        struct thread *t = &tpool[i];
        if (t->status == T_LIVE) {
            pthread_join(t->thread, NULL);
            t->status = T_DEAD;
        }
    }
}

/*
 * Registered with the destructor attribute so that it runs when the
 * program exits; it waits for all threads that are still live.
 */
__attribute__((destructor))
void cleanup() {
    join();
}

shm-test.c

#include "thread.h"

// Global counter; Thello reads and increments it from every thread.
int x = 0;

// Shows that the global x really is shared: every thread reads and
// increments the same counter, so each one prints a different character.
void Thello(int id) {
  // Stagger the threads: thread #id sleeps for id * 100000 microseconds.
  usleep(id * 100000);

  int slot = x++;
  char label = "123456789ABCDEF"[slot];
  printf("Hello from thread #%c\n", label);
}

// Start ten threads that all run Thello.
int main() {
  int remaining = 10;
  while (remaining-- > 0) {
    create(Thello);
  }
}

stack-probe.c

#include "thread.h"

// Thread-local variables: __thread gives every thread its own copy of each
// variable below (comparable in spirit to Java's ThreadLocal).
__thread char *base;  // set by Tprobe to the address of its own argument
__thread char *cur;   // last address stored through set_cur
__thread int id;      // id of the current thread, set by Tprobe

// Store ptr in the thread-local cur. Kept out of line (noinline) so the
// thread-local access can be inspected as its own function with objdump.
__attribute__((noinline))
void set_cur(void *ptr) {
  cur = ptr;
}
// Return the current thread's copy of cur (noinline, like set_cur).
__attribute__((noinline))
char *get_cur() {
  return cur;
}

// Deliberately recurses without bound to overflow the stack, in order to see
// how much memory the OS gives a thread's private stack. The author's
// observation: at most 8192 KB (2^13 KB) is allocated.
//
// Every call records the address of its own argument; every 1024th call
// prints the distance from base to that address, in KB.
void stackoverflow(int n) {
  set_cur(&n);

  if ((n % 1024) == 0) {
    printf("Stack size of T%d >= %d KB\n", id, (int)(base - get_cur()) / 1024);
  }

  stackoverflow(n + 1);
}

// Thread entry: remember this thread's id and the address of its argument
// as the reference point (base), then start the unbounded recursion.
void Tprobe(int tid) {
  base = (char *)&tid;
  id = tid;
  stackoverflow(0);
}

// Start four probe threads.
int main() {
  // Unbuffered stdout: each line is written as soon as it is printed.
  setbuf(stdout, NULL);

  int started = 0;
  while (started < 4) {
    create(Tprobe);
    started++;
  }
}

提出疑问：这个大小为什么是 8192 KB？能否手动设置？（可以：这是默认的栈大小上限，可用 `ulimit -s` 查看和修改，也可以在创建线程时通过 `pthread_attr_setstacksize` 指定。）

多线程特性

原子性

常见假设：当前程序独占处理器执行（根本不成立啊…）

思考：

为什么经典的 i++ 之类的操作在并发执行时会出问题，而 printf 却不会出现打印到一半被另一个线程的输出打断的情况？

// Thread entry: prints "aaaaa" forever, to observe whether the output of a
// single printf call can be interleaved with another thread's output.
void Ta() {
  while (1) {
    printf("aaaaa");
  }
}
// Thread entry: prints "bbb" forever, to observe whether the output of a
// single printf call can be interleaved with another thread's output.
void Tb() {
  while (1) {
    printf("bbb");
  }
}

// Start the two printing threads, Ta first and then Tb.
int main() {
  void (*entries[])() = { Ta, Tb };
  int count = (int)(sizeof entries / sizeof entries[0]);

  for (int k = 0; k < count; k++) {
    create(entries[k]);
  }
}

man 3 printf -> / thread（搜索和thread相关的）

发现系统库早就考虑了昂！！！

两个线程输出的字符串不会相互交错（单次 printf 的输出是完整的），可以自己测试验证。查阅手册也可以确认：printf 是线程安全（MT-Safe）的。

原子性和其实现：

顺序

编译器会去优化代码昂！！！

顺序的丧失：

如果想让编译器不去做这样的优化：

可见性

Mem-ordering.c

#include "thread.h"

// Shared variables accessed by write_x_read_y and write_y_read_x.
int x = 0;
int y = 0;

// Bit 0 is waited on and toggled by T1, bit 1 by T2; Tsync sets both.
atomic_int flag;

// Current value of flag (atomic load).
#define FLAG (atomic_load(&flag))
// Atomically XOR val into flag.
#define FLAG_XOR(val) (atomic_fetch_xor(&flag, (val)))
// Busy-wait until cond becomes true.
#define WAIT_FOR(cond) while (!(cond)) { }

 // Store 1 to x and then load y, as two consecutive movl instructions, and
 // print the loaded value followed by a space.
 //
 // Operands: %0 is x (memory output), %1 is y_val (register output),
 // %2 is y (memory input).
 // NOTE(review): presumably written as volatile inline asm so the compiler
 // emits exactly this store followed by this load; whether the hardware
 // keeps that order is what the experiment observes - confirm.
 __attribute__((noinline))
void write_x_read_y() {
  int y_val;
  asm volatile(
    "movl $1, %0;" // x = 1
    "movl %2, %1;" // y_val = y
    : "=m"(x), "=r"(y_val) : "m"(y)
  );
  printf("%d ", y_val);
}

 // Store 1 to y and then load x, as two consecutive movl instructions, and
 // print the loaded value followed by a space.
 //
 // Operands: %0 is y (memory output), %1 is x_val (register output),
 // %2 is x (memory input).
 // NOTE(review): presumably written as volatile inline asm so the compiler
 // emits exactly this store followed by this load; whether the hardware
 // keeps that order is what the experiment observes - confirm.
 __attribute__((noinline))
void write_y_read_x() {
  int x_val;
  asm volatile(
    "movl $1, %0;" // y = 1
    "movl %2, %1;" // x_val = x
    : "=m"(y), "=r"(x_val) : "m"(x)
  );
  printf("%d ", x_val);
}

// Worker for bit 0 of flag: spin until the bit is set, run
// write_x_read_y once, then toggle the bit back off. Repeats forever.
void T1(int id) {
  for (;;) {
    WAIT_FOR((FLAG & 1));
    write_x_read_y();
    FLAG_XOR(1);
  }
}

// Worker for bit 1 of flag: spin until the bit is set, run
// write_y_read_x once, then toggle the bit back off. Repeats forever.
void T2() {
  for (;;) {
    WAIT_FOR((FLAG & 2));
    write_y_read_x();
    FLAG_XOR(2);
  }
}

// Coordinator. (See the lecture video: picture two hands flipping two
// switches at the same moment to start two pieces of work in parallel.)
// Each round resets x and y, releases T1 and T2 together by setting both
// flag bits, waits until both bits are cleared again, then ends the line.
void Tsync() {
  for (;;) {
    // Reset the shared variables; the full barrier is meant to guarantee
    // that both writes have reached memory before the workers are released.
    y = 0;
    x = 0;
    __sync_synchronize();  // full barrier
    usleep(1);             // + delay

    // Both workers must be idle here (flag == 0); set bit 0 and bit 1.
    assert(FLAG == 0);
    FLAG_XOR(3);

    // T1 clears bit 0 and T2 clears bit 1 when they are done.
    WAIT_FOR(FLAG == 0);
    printf("\n");
    fflush(stdout);
  }
}

// Start the two workers first, then the coordinator that drives them.
int main() {
  void (*entries[])() = { (void (*)())T1, T2, Tsync };
  int count = (int)(sizeof entries / sizeof entries[0]);

  for (int k = 0; k < count; k++) {
    create(entries[k]);
  }
}