Bean Li

插入排序 C实现

Bean Li — 2023-06-10T22:00:00+00:00

前言

复习《算法：C语言实现》一书，本文实现了插入排序。插入排序的思路和人打牌的时候，摸牌，把新摸到的牌插入到指定位置很相似。

#ifndef _SORT_BASE_H_
#define _SORT_BASE_H_

typedef int Item ;

#define Key(A) (A)
#define less(A,B) (Key(A) < Key(B))
#define exch(A,B) { Item tmp = A; A=B ; B=tmp;}
#define compexch(A, B) if(less(B, A)) exch(A,B)


#endif

#ifndef _INSERTSORT_H
#define _INSERTSORT_H

#include "sort_base.h"

void insertsort_slow(Item a[], int l , int r)
{
    int i ,j ;
    for(i = l+1 ; i <=r ; i++)
        for(j = i; j > l; j--)
            compexch(a[j-1], a[j]);
}

void insertsort(Item a[], int l,  int r)
{
    int i ;

    //put minimal value to the leftmost position , as sentinel key
    for(i = r ; i  > l; i--)
        compexch(a[i-1], a[i]);

    for( i = l+2 ; i <=r ; i++)
    {
        int j = i ;
        Item v = a[i] ;

        /* no need add j-1 >= l condition, because a[0] is alreaay the most minimal value already*/
        /* use while for break immediately to skip useless compare*/
        while(less(v, a[j-1]))
        {
            a[j] = a[j-1] ;
            j-- ;
        }
        a[j] = v ;
    }
}

#endif

insertsort_slow 这个版本，更容易理解，但是效率不高。

内层循环是从右往左比较，而左边的子序列实际上已经是有序的了，当碰到关键字不大于整备插入的数据项的关键字的时候，其实没有继续比较了，缺少了break机制，增加了无谓的比较。
j > l 这个比较，大部分情况下是无用的，只有要插入的元素是最小的并且要插入数组的最前端的时候，该条件才有真正的意义。通常的一个改进思路是把最小的元素放在a[0] ,作为哨兵，只需要对a[1]~a[N]的元素进行排序即可。这种改进思路，可以大面积减少比较。

通过上述思路分析，将insertsort改进成最终的版本。

快速排序 C实现

Bean Li — 2023-06-05T10:29:00+00:00

前言

最近在复习数据结构，参考的是《算法：C语言实现》。本文实现了quicksort，对于某些特殊序列，quicksort恶化的情况，采用了三者取中的方式，防止快排性能恶化。

对于选择整个输入中第K小的元素，可以不必完成整个排序之后，再来取对应位置的值，可以参考快排的思路，快速获得整个序列中第Kth的元素。同时实现了递归和非递归的版本。

#ifndef _SORT_BASE_H_
#define _SORT_BASE_H_

typedef int Item ;

#define Key(A) (A)
#define less(A,B) (Key(A) < Key(B))
#define exch(A,B) { Item tmp = A; A=B ; B=tmp;}
#define compexch(A, B) if(less(B, A)) exch(A,B)


#endif

#ifndef _QUICKSORT_H_
#define _QUICKSORT_H_

#include "sort_base.h"
#include "insertsort.h"

#define CUTOFF (8)

int partition(Item a[], int l, int r)
{
    int i = l-1;
    int j = r ;
    Item v = a[r];
    for(;;)
    {
        while(less(a[++i],v));
        while(less(v, a[--j]))
        {
            if(j == l)
                break;
        }
        if(i >=j)
            break;
        exch(a[i], a[j]);
    }
    exch(a[i], a[r]);
    return i ;
}

void _quicksort(Item a[], int l, int r)
{
    if(r - l <=CUTOFF)
        return ;

    /* find the middle element of (a[l], a[mid] , a[r])*/
    exch(a[(l+r)/2], a[r-1]);

    compexch(a[l], a[r-1]);
    compexch(a[l], a[r]);
    compexch(a[r-1], a[r]);

    /* a[r-1] is a[mid]
     * a[l] < a[r-1]
     * a[r-1] < a[r]
     * a[l] and a[r] no need to appear in partition function
     */
    int i = partition(a, l+1, r -1);
    _quicksort(a, l , i-1);
    _quicksort(a, i+1 , r);

}


void quicksort(Item a[], int l , int r)
{
    _quicksort(a, l , r);
    insertsort(a, l , r);
}


/* recursive version of find kth element*/
Item find_kth_element(Item a[], int l, int r, int k)
{
    int pivot = partition(a, l , r);
    if (pivot == k)
        return a[pivot];

    if (pivot > k)
        return find_kth_element(a, l, pivot-1, k);
    else
        return find_kth_element(a, pivot+1, r, k);

}

/* non-recursive version of find kth element*/
Item find_kth_element_n(Item a[], int l, int r, int k)
{
    while(l < r)
    {
        int pivot = partition(a, l , r);
        if (pivot == k)
            return a[pivot];
        if (pivot > k)
            r  = pivot - 1;
        else
            l =  pivot + 1 ;
    }
    
    return a[l];
}

#endif

#include 
#include 
#include 
#include 
#include 
#include 
#include 
#include "shellsort.h"
#include "quicksort.h"


std::vector<uint32_t> generate_test_vec(size_t n) {
    auto test_vec = std::vector<uint32_t> {};
    while (test_vec.size() < n) {
        test_vec.push_back((uint32_t) std::rand() %100000);
    }
    return test_vec;
}

int cmp_int(const void* a, const void *b)
{
    return ( *(int*)a - *(int*)b );
}

int main() {
    for (int round = 0; round < 3; round++) {
        std::printf("round %d:\n", round);

        auto test_vec1 = generate_test_vec(2000000);
        auto test_vec2 = test_vec1;
        auto test_vec3 = test_vec1;
        auto test_vec4 = test_vec1;

        auto start_time1 = std::clock();
        //std::sort(test_vec1.begin(), test_vec1.end());
        qsort(test_vec1.data(), test_vec1.size(), 4, cmp_int);
        std::printf("qsort(glibc) time: %.3lf sec\n", (std::clock() - start_time1) * 1.0 / CLOCKS_PER_SEC);

        auto start_time2 = std::clock();
        shellsort_2((Item*)test_vec2.data(), 0, test_vec2.size()-1);
        std::printf("shellsort time: %.3lf sec\n", (std::clock() - start_time2) * 1.0 / CLOCKS_PER_SEC);

        auto start_time3 = std::clock();
        quicksort((Item*)test_vec3.data(), 0, test_vec3.size()-1);
        std::printf("quicksort time: %.3lf sec\n", (std::clock() - start_time3) * 1.0 / CLOCKS_PER_SEC);

        auto start_time4 = std::clock();
        std::sort(test_vec4.begin(), test_vec4.end());
        std::printf("std::sort time: %.3lf sec\n", (std::clock() - start_time4) * 1.0 / CLOCKS_PER_SEC);

        assert(test_vec1 == test_vec2);
        assert(test_vec1 == test_vec3);
        assert(test_vec1 == test_vec4);

    }
}

ROG-Manjaro C/sort » ./test
round 0:
qsort(glibc) time: 0.556 sec
shellsort time: 0.898 sec
quicksort time: 0.454 sec
std::sort time: 0.989 sec
round 1:
qsort(glibc) time: 0.555 sec
shellsort time: 0.800 sec
quicksort time: 0.393 sec
std::sort time: 1.064 sec
round 2:
qsort(glibc) time: 0.507 sec
shellsort time: 0.900 sec
quicksort time: 0.498 sec
std::sort time: 1.145 sec

C++ 调用Rust

Bean Li — 2023-06-05T10:29:00+00:00

前言

软件开发中，没有银弹，我们有时需要多种语言互相调用支援，发挥各自的优势。本文介绍Rust的FFI 外部函数接口。

本文重点介绍C或者C++调用Rust library的方法。

Rust在实现std::slice::sort_unstable的时候，用了一种新的快速排序变种PDQSort，相对其它语言里面普遍用到的IntroSort有较大的性能提升，我们偷个懒，不自己实现C的pdqsort，调用Rust的pdqsort，作为本次练习的任务。

Rust 侧

第一步：

cargo new pdqsort --lib

执行完毕后，我们得到了：

ROG-Manjaro Rust/pdqsort ‹master*› » tree
.
├── Cargo.lock
├── Cargo.toml
└── src
    └── lib.rs

2 directories, 3 files

我们修改src/lib.rs

#[no_mangle]
pub unsafe extern fn rust_u32sort(elements: *mut u32, size: u64) {
    let elements = std::slice::from_raw_parts_mut(elements, size as usize);
    elements.sort_unstable();
}

同时修改Cargo.toml :

[lib]
crate-type = ["cdylib"]

crate-type = ["cdylib"]会创建一个动态链接的库。可查看 Cargo 文档的动态或静态库，了解更多信息.

cdylib是在 RFC 1510 中引入，并改善了现有的dylib文件，减小其大小，和导出更少符号。

然后我们可以通过调用：

ROG-Manjaro Rust/pdqsort ‹master*› » cargo build --release
   Compiling pdqsort v0.1.0 (/home/manu/CODE/Rust/pdqsort)
    Finished release [optimized] target(s) in 1.49s
ROG-Manjaro Rust/pdqsort ‹master*› » tree
.
├── Cargo.lock
├── Cargo.toml
├── src
│   └── lib.rs
└── target
    ├── CACHEDIR.TAG
    └── release
        ├── build
        ├── deps
        │   ├── libpdqsort.so
        │   └── pdqsort.d
        ├── examples
        ├── incremental
        ├── libpdqsort.d
        └── libpdqsort.so

8 directories, 8 files

可以看出我们已经可以编译出了libpdqsort.so。

如果想用静态链接：

[lib]
crate-type = ["staticlib"]

会编译出来如下内容：

ROG-Manjaro Rust/pdqsort ‹master*› » tree
.
├── Cargo.lock
├── Cargo.toml
├── src
│   └── lib.rs
└── target
    ├── CACHEDIR.TAG
    └── release
        ├── build
        ├── deps
        │   ├── libpdqsort-4b11be617319d092.a
        │   └── pdqsort-4b11be617319d092.d
        ├── examples
        ├── incremental
        ├── libpdqsort.a
        └── libpdqsort.d

C++ 侧

C++侧，我们写测试代码，即对100万个unsigned int数进行排序。

#include 
#include 
#include 
#include 
#include 
#include 
#include 

extern "C" int rust_u32sort(uint32_t* elements, uint64_t size);

std::vector generate_test_vec(size_t n) {
    auto test_vec = std::vector {};
    while (test_vec.size() < n) {
        test_vec.push_back((uint32_t) std::rand());
    }
    return test_vec;
}

int main() {
    for (int round = 0; round < 3; round++) {
        std::printf("round %d:\n", round);

        auto test_vec1 = generate_test_vec(10000000);
        auto test_vec2 = test_vec1;

        auto start_time1 = std::clock();
        std::sort(test_vec1.begin(), test_vec1.end());
        std::printf("std::sort time: %.3lf sec\n", (std::clock() - start_time1) * 1.0 / CLOCKS_PER_SEC);

        auto start_time2 = std::clock();
        rust_u32sort(test_vec2.data(), test_vec2.size());
        std::printf("rust_u32sort time: %.3lf sec\n", (std::clock() - start_time2) * 1.0 / CLOCKS_PER_SEC);

        assert(test_vec2 == test_vec1);
    }
}

编译：

 动态链接：
 c++  -std=c++20 -O3  test.cpp -L ../../Rust/pdqsort/target/release/ -lpdqsort -o sort   
 静态链接：
 c++  -std=c++20 -O3  test.cpp -L ../../Rust/pdqsort/target/release/ -lpdqsort -o sort 

执行：

动态链接：
-----------
LD_LIBRARY_PATH=./target/release ../../C++/pdqsort/sort
round 0:
std::sort time: 1.746 sec
rust_u32sort time: 0.797 sec
round 1:
std::sort time: 1.786 sec
rust_u32sort time: 0.841 sec
round 2:
std::sort time: 1.829 sec
rust_u32sort time: 0.834 sec


静态链接：
---------
ROG-Manjaro C++/pdqsort » ./sort
round 0:
std::sort time: 1.787 sec
rust_u32sort time: 0.721 sec
round 1:
std::sort time: 1.848 sec
rust_u32sort time: 0.776 sec
round 2:
std::sort time: 1.831 sec
rust_u32sort time: 0.745 sec

Summary

结果上可以看到Rust的排序比C++ std::sort快了一倍多，结果比较出乎意料。目前在读PDQSort的代码，分析性能比传统快排更优的原因是使用了一种新的快排变种BlockQuickSort，这个变种算法可以较大改善传统快排中的分支预言失败的情况，具体实现还没有读完，后续再补充上。

附上BlockQuickSort的论文，2016年出的，BlockQuicksort: Avoiding Branch Mispredictions in Quicksort.

我们将上述测试放到物理机上跑：

[david@david-latitude3510 pdqsort]$ LD_LIBRARY_PATH=./target/release perf stat ../../C++/pdqsort/sort
round 0:
std::sort time: 0.782 sec
round 1:
std::sort time: 0.785 sec
round 2:
std::sort time: 0.779 sec

 Performance counter stats for '../../C++/pdqsort/sort':

          3,029.93 msec task-clock:u                     #    1.000 CPUs utilized
                 0      context-switches:u               #    0.000 /sec
                 0      cpu-migrations:u                 #    0.000 /sec
            11,694      page-faults:u                    #    3.859 K/sec
     9,780,539,711      cycles:u                         #    3.228 GHz                         (74.95%)
     7,320,156,912      instructions:u                   #    0.75  insn per cycle              (75.02%)
     1,664,764,519      branches:u                       #  549.440 M/sec                       (75.05%)
       305,855,559      branch-misses:u                  #   18.37% of all branches             (74.97%)

       3.030752533 seconds time elapsed

       2.800187000 seconds user
       0.133345000 seconds sys


[david@david-latitude3510 pdqsort]$ LD_LIBRARY_PATH=./target/release perf stat ../../C++/pdqsort/sort
round 0:
rust_u32sort time: 0.342 sec
round 1:
rust_u32sort time: 0.341 sec
round 2:
rust_u32sort time: 0.343 sec

 Performance counter stats for '../../C++/pdqsort/sort':

          1,619.17 msec task-clock:u                     #    1.000 CPUs utilized
                 0      context-switches:u               #    0.000 /sec
                 0      cpu-migrations:u                 #    0.000 /sec
            11,702      page-faults:u                    #    7.227 K/sec
     5,300,322,850      cycles:u                         #    3.273 GHz                         (74.99%)
     9,081,461,937      instructions:u                   #    1.71  insn per cycle              (74.99%)
     1,185,526,644      branches:u                       #  732.182 M/sec                       (74.99%)
        69,121,012      branch-misses:u                  #    5.83% of all branches             (75.02%)

       1.619959094 seconds time elapsed

       1.566298000 seconds user
       0.049854000 seconds sys

我们可以看到branch-misses，对于改进后的快排，只有5.83%的分支预测失败，而传统的快排，18.37%的branch-misses。

参考文献

Rust的排序比C++快了一倍

C++ pdqsort排序

关于logrotate

Bean Li — 2022-10-26T10:29:00+00:00

前言

最近帮忙处理了/var/log/ceph/下的文件不能rotate的问题，分析了原因，把关于logrotate的相关的知识点梳理了下，特记录为本文。在合作伙伴的环境里面，因为ceph.audit.log 可能会产生出比较大的文件，ceph相关的rotate配置被改成了这个样子。

/var/log/ceph/ceph.audit.log {
    rotate 14
    daily
    maxsize 5G
    compress
    sharedscripts
    postrotate
        killall -q -1 ceph-mon || true
    endscript
    missingok
    notifempty
    su root ceph
}
/var/log/ceph/*.log {
    rotate 7
    daily
    compress
    sharedscripts
    postrotate
        killall -q -1 ceph-mon ceph-mgr ceph-mds ceph-fuse radosgw rbd-mirror || true
        killall -q -1 -r ceph-osd.* || true
    endscript
    missingok
    notifempty
    su root ceph
}

合作伙伴报故障，说ceph-osd等日志都没有回滚。

如何检查conf文件

logrotate提供了-d选项，可以用来检查conf文件是否有错误。

root@node165:/etc/logrotate.d# logrotate --help
Usage: logrotate [OPTION...] 
  -d, --debug               Don't do anything, just test (implies -v)
  -f, --force               Force file rotation
  -m, --mail=command        Command to send mail (instead of `/usr/bin/mail')
  -s, --state=statefile     Path of state file
  -v, --verbose             Display messages during rotation
      --version             Display version information

Help options:
  -?, --help                Show this help message
      --usage               Display brief usage message

我们检查下旧的conf文件：

root@node165:/etc/logrotate.d# logrotate -d ceph-common
reading config file ceph-common
error: ceph-common:14 duplicate log entry for /var/log/ceph/ceph.audit.log

Handling 2 logs

rotating pattern: /var/log/ceph/ceph.audit.log  after 1 days (14 rotations)
empty log files are not rotated, log files >= 5368709120 are rotated earlier, old logs are removed
switching euid to 0 and egid to 64045
considering log /var/log/ceph/ceph.audit.log
  log does not need rotating
not running postrotate script, since no logs were rotated
switching euid to 0 and egid to 0

rotating pattern: /var/log/ceph/*.log  after 1 days (7 rotations)
empty log files are not rotated, old logs are removed
No logs found. Rotation not needed.
root@node165:/etc/logrotate.d#

上来就检查到了error。根据wildcard的匹配，ceph.audt.log同时匹配第一条的规则和第二条的规则。这种情况下会怎么样呢。

如果发现有重复的话，本条规则就会放弃，所以/var/log/ceph/*.log 这条规则被放弃，从而，除了ceph.audit.log会正常rotate外，其他的所有日志都不会rotate，只会越变越大，和现场的情况一致。

很有意思的是，如果将两个规则调换下顺序，所有的日志都会rotate，只是ceph.audit.log 按照/var/log/ceph/*.log的规则去rotate，而不是自己的特有规则去rotate。如果文件存在重复，那么后面出现的规则会被抛弃。

触发时机

logrotate什么时候会检查？如果满足条件，什么时候会被执行？

root@node165:/etc/logrotate.d# cat /etc/crontab
# /etc/crontab: system-wide crontab
# Unlike any other crontab you don't have to run the `crontab'
# command to install the new version when you edit this file
# and files in /etc/cron.d. These files also have username fields,
# that none of the other crontabs do.

SHELL=/bin/sh
PATH=/usr/local/sbin:/usr/local/bin:/sbin:/bin:/usr/sbin:/usr/bin

# m h dom mon dow user  command
17 *    * * *   root    cd / && run-parts --report /etc/cron.hourly
25 6    * * *   root    test -x /usr/sbin/anacron || ( cd / && run-parts --report /etc/cron.daily )
47 6    * * 7   root    test -x /usr/sbin/anacron || ( cd / && run-parts --report /etc/cron.weekly )
52 6    1 * *   root    test -x /usr/sbin/anacron || ( cd / && run-parts --report /etc/cron.monthly )
#
root@node165:/etc/logrotate.d#

我们看到crontab中有

hourly
daily
weekly
monthly

四个等级的cron job，其中在/etc/cron.hourly目录下：

root@node165:/etc/cron.hourly# ll
total 24
drwxr-xr-x   2 root root  4096 Oct 11 14:20 ./
drwxr-xr-x 134 root root 12288 Oct 24 20:35 ../
-rwxr-xr-x   1 root root   372 May  6  2015 logrotate*
-rw-r--r--   1 root root   102 Apr  6  2016 .placeholder
root@node165:/etc/cron.hourly#
root@node165:/etc/cron.hourly# cat logrotate
#!/bin/sh

# Clean non existent log file entries from status file
cd /var/lib/logrotate
test -e status || touch status
head -1 status > status.clean
sed 's/"//g' status | while read logfile date
do
    [ -e "$logfile" ] && echo "\"$logfile\" $date"
done >> status.clean
mv status.clean status

test -x /usr/sbin/logrotate || exit 0
/usr/sbin/logrotate /etc/logrotate.conf
root@node165:/etc/cron.hourly#

我们看到每个小时的17分，会执行hourly的任务，其中就有logrotate的相关的任务。所以每个小时的17分，会执行logrotate的相关的任务，logrotate会检查是否满足回滚日志的条件，如果满足的话，会执行相关的命令。

logrotate状态的记录

如果我要问，某个文件上一次rotate是什么时间？这个信息如何查找？

/var/lib/logrotate/status

这个文件记录了logrotate的状态：

root@node164:/var/lib/logrotate# cat status  |grep ceph
"/var/log/ceph/ceph-osd.4.log" 2022-10-17-0:17:1
"/var/log/ceph/ceph-mds.vigex.log" 2022-10-21-17:0:0
"/var/log/ceph/ceph-mds.wbjal.log" 2022-10-17-15:0:0
"/var/log/ceph/ceph-mon.czlyb.log" 2022-10-17-15:0:0
"/var/log/ceph/ceph-mgr.czlyb.log" 2022-10-21-16:17:1
"/var/log/ceph/ceph-client.radosgw.0.log" 2022-10-17-0:17:1
"/var/log/ceph/ceph-osd.6.log" 2022-10-24-5:17:1
"/var/log/ceph/ceph-mgr.jmsac.log" 2022-10-24-0:17:1
"/var/log/ceph/ceph-mon.jmsac.log" 2022-10-21-16:0:0
"/var/log/ceph/ceph-osd.10.log" 2022-10-22-13:17:1
"/var/log/ceph/ceph-osd.8.log" 2022-10-22-6:17:1
"/var/log/ceph/ceph-osd.1.log" 2022-10-17-0:17:1
"/var/log/ceph/ceph.audit.log" 2022-10-24-14:17:1
"/var/log/ceph/ceph-mds.ywjxq.log" 2022-10-17-16:0:0
"/var/log/ceph/ceph-osd.3.log" 2022-10-17-0:17:1
"/var/log/ceph/ceph-osd.5.log" 2022-10-17-0:17:1
"/var/log/ceph/ceph-osd.7.log" 2022-10-22-6:17:1
"/var/log/ceph/ceph-osd.11.log" 2022-10-22-0:17:1
"/var/log/ceph/ceph-osd.9.log" 2022-10-22-15:17:1
"/var/log/ceph/ceph-osd.0.log" 2022-10-17-0:17:1
"/var/log/ceph/ceph-mon.odkzh.log" 2022-10-17-0:17:1
"/var/log/ceph/ceph-mgr.odkzh.log" 2022-10-18-0:17:1
"/var/log/ceph/ceph.log" 2022-10-24-0:17:1
"/var/log/ceph/ceph-mds.bihnq.log" 2022-10-17-1:17:1
"/var/log/ceph/ceph-osd.2.log" 2022-10-17-0:17:1

通过查看这个文件，我们可以了解上次logrotate发生的事件。

某些有意义的参数

logrotate的参数很多，有些参数非常有意思：

daily

这个参数的意思是按天回滚，日志一天存放一个文件，该参数一般和如下参数配合使用：

rotate 7
daily
compress

每天日志回滚一次，最多保留7分，旧的日志会被压缩。这样的话，基本上会保留一周的日志。但是有些时候，一周之前的日志回滚掉，会给排查问题带来很大的麻烦，比如需要查18天前的日志，那么就来不及了，因为日志已经被删除掉了。

minsize 和maxsize

这两个参数非常有用。logrotate是为了保护磁盘空间，防止日志永不删除，导致系统盘或者日志盘被写满，从而导致异常发生。但是这个世界的本质就是妥协：

一方面，当我们需要通过日志排查问题的时候，我们希望尽可能多的日志存放在磁盘上
另一方面，我们又担心存放太多的日志文件会导致系统盘或者日志盘被占满，引发故障
```
进退维谷，左右为难
```
除了daily的选项外，logrotate提供了minsize这个非常有用的参数。这个参数是对logrotate一票否定的参数，即要想rotate，至少log的大小要超过minsize，这个参数对那些平时比较静默的日志文件比较友好。如果一个日志文件，平时基本不打印日志，那么纵然设置了daily参数，到了每个小时的17分的时候，依然不会回滚，因为日志的size低于minsize，无需回滚。 minsize就给出了一个潜在的可能，即可能存放超过指定天数（比如7天）的日志量，如果某些日志文件平时不经常打印的情况下，如下所示：
```
rotate 7
daily
compress
minsize 50M
maxsize 2G
```
maxsize则提供了另外一层的可能，即一天的事件还未到，就产生了比较大的日志量，单纯靠daily这个参数控制，为时已晚，可能还没等到回滚，日志已经打爆了系统盘或者日志盘。maxsize是一个对logrotate一票决定的参数，即只要log的大小超过maxsize，就会及时回滚。

查看NTP的状态

Bean Li — 2022-06-26T10:29:00+00:00

如何查看当前当前NTP状态

ntpstat

>> NODE: 20.20.20.222 <<
synchronised to local net at stratum 11
   time correct to within 10 ms
   polling server every 16 s

>> NODE: 20.20.20.223 <<
synchronised to NTP server (20.20.20.222) at stratum 12
   time correct to within 12 ms
   polling server every 16 s
   
>> NODE: 20.20.20.224 <<
synchronised to NTP server (20.20.20.222) at stratum 12
   time correct to within 11 ms
   polling server every 16 s

>> NODE: 20.20.20.226 <<
synchronised to NTP server (20.20.20.222) at stratum 12
   time correct to within 12 ms
   polling server every 16 s

>> NODE: 20.20.20.227 <<
synchronised to NTP server (20.20.20.222) at stratum 12
   time correct to within 12 ms
   polling server every 16 s

首先第一行NTP Server反应了当前机器的NTP Server IP。第二行反应了时间的偏差。第三行表示同步的时间频率。

root@n82-2:~# cat /etc/ntp.conf
...
server 20.20.20.222 burst iburst minpoll 4 maxpoll 4
...

为什么是每16秒查询一次？ poll 的含义是how frequently to query server (in seconds)， 4的含义是2的4次方，即每16秒查询一次。

除了正常的上述情况外，还有：

>> NODE: 20.20.20.222 <<
unsynchronised
   polling server every 8 s

>> NODE: 20.20.20.223 <<
unsynchronised
   polling server every 8 s

>> NODE: 20.20.20.224 <<
unsynchronised
   polling server every 8 s

上述情况出现在，刚刚重启ntpd，尚未同步的阶段。

ntpq -p

root@n82-1:/var/log# ntpq -pn
     remote           refid      st t when poll reach   delay   offset  jitter
==============================================================================
 20.20.20.224    20.20.20.222    12 s   13   16  376    0.142   -0.010   0.202
 20.20.20.226    20.20.20.222    12 s    1   16  377    0.114   -0.003   0.076
 20.20.20.227    20.20.20.222    12 s    1   16  377    0.150   -0.021   0.020
 20.20.20.223    20.20.20.222    12 s    4   16  377    0.136   -0.013  18.163
*127.127.1.0     .LOCL.          10 l   10   16  377    0.000    0.000   0.000
root@n82-1:/var/log#

remote和refid，表明的本机器的远端NTP Server和该远端Server的上级 NTP Server

remote and refid : remote NTP server, and its NTP server

按照我们的NTP部署，如果没有外部的NTP服务器，我们会选择 ceph-mon leader节点，作为内部集群的NTP Server。对于我们本集群而言：

root@n82-1:/var/log# ceph mon dump
dumped monmap epoch 5
epoch 5
fsid bd489dd6-57c1-4878-a279-739624997f24
last_changed 2022-06-09 15:48:42.559018
created 2022-06-09 15:47:51.033044
0: 20.20.20.222:6789/0 mon.mvdfp
1: 20.20.20.223:6789/0 mon.fciae
2: 20.20.20.224:6789/0 mon.vmlcw
3: 20.20.20.226:6789/0 mon.qcdnb
4: 20.20.20.227:6789/0 mon.abtdv

20.20.20.222节点是IP最小的ceph-mon，正常情况下，整个集群的NTP Server 20.20.20.222。

可是，为什么在20.20.20.222节点上，执行ntpq -pn指令，remote这一列，会列出来其他存储节点的IP？

这就不得不提 peer参数了。

peer 20.20.20.224 burst iburst minpoll 4 maxpoll 4

peer中出现的IP和 server 指定的IP，都出现在ntpq -pn的remote列，why？这两者有什么区别？

ntpd service requests the time from another server
ntpd service exchanges the time with a fellow peer

NTP Server 是有层级的概念的，即配置文件中的：

server 127.127.1.0 burst iburst minpoll 4 maxpoll 4
fudge 127.127.1.0 stratum 10

stratum 这个值越低，表示越权威。127.127.1.0 表示local本机作为NTP Server，层级一般定为10。我们看一个集群内的普通节点：

root@n82-2:~# ntpq -pn
     remote           refid      st t when poll reach   delay   offset  jitter
==============================================================================
+20.20.20.224    20.20.20.222    12 s    8   16  376    0.163   -0.008   0.562
+20.20.20.226    20.20.20.222    12 s    7   16  377    0.172    0.046   0.074
+20.20.20.227    20.20.20.222    12 s    8   16  377    0.144    0.006   0.167
+20.20.20.222    LOCAL(0)        11 s    1   16  377    0.104   -0.018   0.122
*20.20.20.222    LOCAL(0)        11 u    6   16  377    0.090    0.014   0.584

可以看到20.20.20.222 stratum 层级为11 ，而出现在peer中的，层级为 12 。

下面介绍下ntpq 各个字段的含义： poll: 这一行，表示周期，即查询周期，前面介绍了 minpoll 4 maxpoll 4 即16秒查询一次。 when：这一行表示，距离上一次查询的时间。 reach：这个是个8进制的表示：

*	377 = 0b1111111， 表示最近8次查询都成功了
*   376 = 0b11111110 除了最近一次查询失败外，其他7次查询都成功了。
*   257 = 0b10101111 表示，最近的4次都成功了，以最近一次为开始算起，第五次和第七次查询失败了

dalay： network round trip time (in milliseconds) 这个是估算的和对应NTP Server（或者peer）之间的网络延迟。因为是虚拟机，所以延迟是0.1ms以上。 offset：本机与remote NTP Server （or peer）的时间差异，我看文档已里面写的单位是ms，但是我自己判断单位是秒。 jitter: 这个是抖动的含义，比较高的抖动，表示要么是remote Server不够稳定精准，或者是网络条件太差。

jitter： difference of successive time values from server (high jitter could be due to an unstable clock or, more likely, poor network performance)

如何校验某个NTP Server是否可用

我们可以通过ntpdate -d指令来校验，他用来诊断，但是并不会更改本地的时间。

       -d     Enable  the  debugging  mode,  in which ntpdate will go through all the steps, but not adjust the local clock. Information useful for general debugging will also be printed.

如果NTP Server网络可达，并且可以响应我们的查询，输入如下：

root@n82-1:/home/btadmin# ntpdate -d 20.20.20.223
25 Jun 15:24:50 ntpdate[919323]: ntpdate 4.2.8p4@1.3265-o Tue Jan  7 15:08:24 UTC 2020 (1)
Looking for host 20.20.20.223 and service ntp
host found : 20.20.20.223
transmit(20.20.20.223)
receive(20.20.20.223)
transmit(20.20.20.223)
receive(20.20.20.223)
transmit(20.20.20.223)
receive(20.20.20.223)
transmit(20.20.20.223)
receive(20.20.20.223)
server 20.20.20.223, port 123
stratum 11, precision -24, leap 00, trust 000
refid [20.20.20.223], delay 0.02576, dispersion 0.00005
transmitted 4, in filter 4
reference time:    e66136bf.6e4bc565  Sat, Jun 25 2022 15:24:47.430
originate timestamp: e66136c9.1d79f7d3  Sat, Jun 25 2022 15:24:57.115
transmit timestamp:  e66136c9.1d70de0a  Sat, Jun 25 2022 15:24:57.115
filter delay:  0.02602  0.02638  0.02591  0.02576
         0.00000  0.00000  0.00000  0.00000
filter offset: -0.00006 -0.00026 0.000019 -0.00001
         0.000000 0.000000 0.000000 0.000000
delay 0.02576, dispersion 0.00005
offset -0.000011

25 Jun 15:24:57 ntpdate[919323]: adjust time server 20.20.20.223 offset -0.000011 sec
root@n82-1:/home/btadmin# echo $?
0

如果网络不可达，或者不能响应查询的请求：

root@n82-1:/home/btadmin# ntpdate -d 20.20.20.222
25 Jun 15:28:20 ntpdate[931881]: ntpdate 4.2.8p4@1.3265-o Tue Jan  7 15:08:24 UTC 2020 (1)
Looking for host 20.20.20.222 and service ntp
host found : 20.20.20.222
transmit(20.20.20.222)
transmit(20.20.20.222)
transmit(20.20.20.222)
transmit(20.20.20.222)
transmit(20.20.20.222)
20.20.20.222: Server dropped: no data
server 20.20.20.222, port 123
stratum 0, precision 0, leap 00, trust 000
refid [20.20.20.222], delay 0.00000, dispersion 64.00000
transmitted 4, in filter 4
reference time:    00000000.00000000  Thu, Feb  7 2036 14:28:16.000
originate timestamp: 00000000.00000000  Thu, Feb  7 2036 14:28:16.000
transmit timestamp:  e661379f.36a98426  Sat, Jun 25 2022 15:28:31.213
filter delay:  0.00000  0.00000  0.00000  0.00000
         0.00000  0.00000  0.00000  0.00000
filter offset: 0.000000 0.000000 0.000000 0.000000
         0.000000 0.000000 0.000000 0.000000
delay 0.00000, dispersion 64.00000
offset 0.000000

25 Jun 15:28:33 ntpdate[931881]: no server suitable for synchronization found
root@n82-1:/home/btadmin# echo $?
1

NTP和hwclock

最后一个话题就是NTP 与hardware clock了。硬件时钟和系统时钟到底啥关系，NTP会在其中发挥什么样的影响。

Linux层面有两个时钟：

system clock
hardware clock

hwclock manual中，有如下一段话：

Automatic Hardware Clock Synchronization by the Kernel
       You should be aware of another way that the Hardware Clock is kept synchronized in some systems.  The Linux kernel has a mode wherein it copies  the System  Time  to  the  Hardware Clock every 11 minutes. This mode is a compile time option, so not all kernels will have this capability.  This is a good mode to use when you are using something sophisticated like NTP to keep your System Clock synchronized. (NTP is a way to keep your System  Time synchronized either to a time server somewhere on the network or to a radio clock hooked up to your system.  See RFC 1305.)

       If the kernel is compiled with the '11 minute mode' option it will be active when the kernel's clock discipline is in a synchronized state.  When in this state, bit 6 (the bit that is set in the mask 0x0040) of the kernel's time_status variable is unset. This value is output as the 'status'  line of the adjtimex --print or ntptime commands.

       It  takes  an  outside influence, like the NTP daemon ntpd(1), to put the kernel's clock discipline into a synchronized state, and therefore turn on  '11 minute mode'.  It can be turned off by running anything that sets the System Clock the old fashioned way, including hwclock --hctosys.  However, if the NTP daemon is still running, it will turn '11 minute mode' back on again the next time it synchronizes the System Clock.

       If  your  system runs with '11 minute mode' on, it may need to use either --hctosys or --systz in a startup script, especially if the Hardware Clock is configured to use the local timescale. Unless the kernel is informed of what timescale the Hardware Clock is using, it may clobber  it  with  the wrong one. The kernel uses UTC by default.

       The  first  userspace  command  to  set  the  System  Clock  informs  the  kernel  what timescale the Hardware Clock is using.  This happens via the persistent_clock_is_local kernel variable.  If --hctosys or --systz is the first, it will set this variable according to the  adjtime  file  or  the appropriate command-line argument.  Note that when using this capability and the Hardware Clock timescale configuration is changed, then a reboot is required to notify the kernel.

大意是说，如果设置了NTP Server，内核每11分钟会调整硬件时钟，使其向系统时钟看起，这被成为 11 miniute mode。看不看起，也是有条件的。

内核中有两个相关的配置选项：

CONFIG_GENERIC_CMOS_UPDATE
CONFIG_RTC_SYSTOHC

如此外，也要关注NTP的状态，我们可以通过timedatectl查看NTP SYNC 状态：

root@SEG-248-82:/home/btadmin# timedatectl
      Local time: Sat 2022-06-25 16:07:59 HKT
  Universal time: Sat 2022-06-25 08:07:59 UTC
        RTC time: Sat 2022-06-25 08:08:00
       Time zone: Asia/Hong_Kong (HKT, +0800)
 Network time on: yes
NTP synchronized: yes
 RTC in local TZ: no

如果ntp同步时间有异常，硬件时钟向系统时钟的同步也会受到影响。

systemd tmpfiles相关的服务

Bean Li — 2022-03-18T10:29:00+00:00

前言

前些日子在排查一个httpd session的问题，发现虚机尽管重启了，但是Firefox浏览器并不logout，机器重启后，Firefox的发送的请求，httpd后台依然会处理。

一般来讲，httpd请求会为每一个client维护一个session，session相关的信息存放位置

CentOS：/tmp/systemd-private-5257097ec71448b0aac566695b533a84-httpd.service-dMlTTI/sessions 这种类似的目录

CentOS 版本：
---------------
[root@node-a tmp]# tree systemd-private-5257097ec71448b0aac566695b533a84-httpd.service-dMlTTI
systemd-private-5257097ec71448b0aac566695b533a84-httpd.service-dMlTTI
└── tmp
    └── sessions
        └── 6d474c39c1606792c92f72973c0f135dc07b6682
2 directories, 1 file

注意为什么CentOS版本的session存放在一个systemd-priviate-开头的奇怪目录下。原因是写在httpd的systemd启动脚本中：

[root@node-a system]# cat httpd.service 
[Unit]
Description=The Apache HTTP Server
After=network.target remote-fs.target nss-lookup.target
Documentation=man:httpd(8)
Documentation=man:apachectl(8)

[Service]
Type=notify
EnvironmentFile=/etc/sysconfig/httpd
ExecStart=/usr/sbin/httpd $OPTIONS -DFOREGROUND
ExecReload=/usr/sbin/httpd $OPTIONS -k graceful
ExecStop=/bin/kill -WINCH ${MAINPID}
# We want systemd to give httpd some time to finish gracefully, but still want
# it to kill httpd after TimeoutStopSec if something went wrong during the
# graceful stop. Normally, Systemd sends SIGTERM signal right after the
# ExecStop, which would kill httpd. We are sending useless SIGCONT here to give
# httpd time to finish.
KillSignal=SIGCONT
PrivateTmp=true

[Install]
WantedBy=multi-user.target

注意上面的PrivateTmp=true，这个选项是一个systemd选项，表示该服务会有一个独立的/tmp目录作为自己的/tmp目录，对于httpd而言：

drwx------  3 root root    4096 Mar 18 13:22 systemd-private-5257097ec71448b0aac566695b533a84-httpd.service-dMlTTI
drwx------  3 root root    4096 Mar 18 12:44 systemd-private-5257097ec71448b0aac566695b533a84-ntpd.service-YDlnOF

很多服务都有类似的行为，比如上面的ntpd的service，毫不意外，ntpd的service脚本中也有：

[Unit]
Description=Network Time Service
After=syslog.target ntpdate.service sntp.service

[Service]
Type=forking
EnvironmentFile=-/etc/sysconfig/ntpd
ExecStart=/usr/sbin/ntpd -u ntp:ntp $OPTIONS
PrivateTmp=true

[Install]
WantedBy=multi-user.target

很奇怪的现象是，CentOS的session文件，有以下行为模式：

reboot -f时，sessoin并不会被清理。reboot -f，然后重建集群，发现很多很奇怪的请求到来，并且被后端正常处理。
reboot的时候，本次的session文件会被清理，但是历史垃圾的session信息不会被清理。

systemd-tmpfiles 相关的服务

Linux操作系统，会有一些临时存放文件的区域，最典型的就应该是/tmp/目录下，该目录存放的文件，一般为临时存放区，不重要，可损失的文件，正式因为这个区域不那么严肃，所以很多进程或者人，会堆放一些文件在该目录下，如果没有任何服务负责管理，很可能会累积非常多的垃圾文件。

Linux的systemd提供了tmpfiles相关的服务：

[root@node-a system]# systemctl status systemd-tmpfiles-*
● systemd-tmpfiles-clean.service - Cleanup of Temporary Directories
   Loaded: loaded (/usr/lib/systemd/system/systemd-tmpfiles-clean.service; static; vendor preset: disabled)
   Active: inactive (dead) since Fri 2022-03-18 13:00:01 CST; 1h 14min ago
     Docs: man:tmpfiles.d(5)
           man:systemd-tmpfiles(8)
  Process: 12340 ExecStart=/usr/bin/systemd-tmpfiles --clean (code=exited, status=0/SUCCESS)
 Main PID: 12340 (code=exited, status=0/SUCCESS)

Mar 18 13:00:01 node-a systemd[1]: Starting Cleanup of Temporary Directories...
Mar 18 13:00:01 node-a systemd[1]: Started Cleanup of Temporary Directories.

● systemd-tmpfiles-clean.timer - Daily Cleanup of Temporary Directories
   Loaded: loaded (/usr/lib/systemd/system/systemd-tmpfiles-clean.timer; static; vendor preset: disabled)
   Active: active (waiting) since Fri 2022-03-18 12:44:40 CST; 1h 29min ago
     Docs: man:tmpfiles.d(5)
           man:systemd-tmpfiles(8)

Mar 18 12:44:40 node-a systemd[1]: Started Daily Cleanup of Temporary Directories.

● systemd-tmpfiles-setup-dev.service - Create Static Device Nodes in /dev
   Loaded: loaded (/usr/lib/systemd/system/systemd-tmpfiles-setup-dev.service; static; vendor preset: disabled)
   Active: active (exited) since Fri 2022-03-18 12:44:35 CST; 1h 29min ago
     Docs: man:tmpfiles.d(5)
           man:systemd-tmpfiles(8)
  Process: 556 ExecStart=/usr/bin/systemd-tmpfiles --prefix=/dev --create --boot (code=exited, status=0/SUCCESS)
 Main PID: 556 (code=exited, status=0/SUCCESS)
   CGroup: /system.slice/systemd-tmpfiles-setup-dev.service

Mar 18 12:44:35 node-a systemd[1]: Starting Create Static Device Nodes in /dev...
Mar 18 12:44:35 node-a systemd[1]: Started Create Static Device Nodes in /dev.

● systemd-tmpfiles-setup.service - Create Volatile Files and Directories
   Loaded: loaded (/usr/lib/systemd/system/systemd-tmpfiles-setup.service; static; vendor preset: disabled)
   Active: active (exited) since Fri 2022-03-18 12:44:38 CST; 1h 29min ago
     Docs: man:tmpfiles.d(5)
           man:systemd-tmpfiles(8)
  Process: 805 ExecStart=/usr/bin/systemd-tmpfiles --create --remove --boot --exclude-prefix=/dev (code=exited, status=0/SUCCESS)
 Main PID: 805 (code=exited, status=0/SUCCESS)
   CGroup: /system.slice/systemd-tmpfiles-setup.service

Mar 18 12:44:38 node-a systemd[1]: Starting Create Volatile Files and Directories...
Mar 18 12:44:38 node-a systemd[1]: Started Create Volatile Files and Directories.
[root@node-a system]# vim systemd-tmpfiles-clean.service 

清理工作一般分成两类:

开机启动时的清理
- systemd-tmpfiles-setup.service
- systemd-tmpfiles-setup-dev.service
Linux正常运行期间的清理
- systemd-tmpfiles-clean.timer

开机清理，这个是比较容易理解，开机的时候，可能所有服务都没有启动，上一轮机器运行阶段，可能产生了不少垃圾文件，或者没来得及清理的文件（比如异常掉电或者reboot -f），需要对具体的目录做一些清理动作。

另外一个就是长时间运行状态下，也要有服务负责清理，因为Linux服务器一般不喜欢关机，我们也有很多机器线上运行时间1000天以上，完全指望开机清理，可能势必垃圾文件堆积成山。

我们一起看下sytemd-tmpfiles-setup.service配置文件的内容：

[Unit]
Description=Create Volatile Files and Directories
Documentation=man:tmpfiles.d(5) man:systemd-tmpfiles(8)
DefaultDependencies=no
Conflicts=shutdown.target
After=systemd-readahead-collect.service systemd-readahead-replay.service local-fs.target systemd-sysusers.service
Before=sysinit.target shutdown.target
RefuseManualStop=yes

[Service]
Type=oneshot
RemainAfterExit=yes
ExecStart=/usr/bin/systemd-tmpfiles --create --remove --boot --exclude-prefix=/dev

该文件可以看到，主要是调用一个名称为/usr/bin/systemd-tmpfiles的可执行文件，传了一些参数。sytemd-tmpfiles有一些参数值得我们注意：

create：

If this option is passed, all files and directories marked with f, F, w, d, D, v, p, L, c, b, m in the configuration files are created
           or written to. Files and directories marked with z, Z, t, T, a, and A have their ownership, access mode and security labels set.

remove

If this option is passed, the contents of directories marked with D or R, and files or directories themselves marked with r or R are removed.

clean

If this option is passed, all files and directories with an age parameter configured will be cleaned up.

boot

Also execute lines with an exclamation mark. (带感叹号的行， --boot才会执行)

boot最好理解，正常运行期间和重启刚开机是又分别的，有些目录，开机阶段可以粗暴删除，但是正常运行期间不能太粗暴，因为你可能没有办法确定，该文件或者目录是否还在被使用。因此后面配置文件会有区分的手段，剧透一下就是 action后面加感叹号，就是只有boot阶段才应该执行，正常运行期间，本条配置可以忽略。

systemd-tmpfiles 的可配置性

我们讲了很多了，但是到底如何告知systemd那些文件可以被清理，可被清理的文件又遵循什么样的策略呢？这个地方就到了配置文件部分了。

配置文件的名称必须符合 package.conf 或 package-part.conf格式。当需要明确的将某部分(part)配置提取出来，以方便用户专门针对这部分进行修改的时候，应该使用第二种命名格式。

对于不同目录下的同名配置文件，仅以优先级最高的目录中的那一个为准。具体说来就是： /etc/tmpfiles.d 的优先级最高、 /run/tmpfiles.d 的优先级居中、 /usr/lib/tmpfiles.d 的优先级最低。软件包应该将自带的配置文件安装在 /usr/lib/tmpfiles.d 目录中，而 /etc/tmpfiles.d 目录仅供系统管理员使用。所有的配置文件，无论其位于哪个目录中，都统一按照文件名的字典顺序处理。如果在多个配置文件中设置了同一个路径(文件或目录)，那么仅以文件名最靠前(字典顺序)的那一个为准，其他针对同一个路径的配置项将会作为警告信息记录到错误日志中。如果有两行的路径互为前后缀，那么始终是先创建前缀行、再创建后缀行，如果还需要删除，那么顺序正好相反，始终是先删除后缀行、再删除前缀行。所有带有shell风格通配符的行，都在所有不带通配符的行之后处理。如果有多个操作符应用于同一个文件(例如 ACL, xattr, 文件属性调整)，那么将始终按固定的顺序操作。除上述清空之外，对于其他情况，文件与目录总是按照它们在配置文件中出现的顺序处理。

[root@node-a system]# cd /usr/lib/tmpfiles.d/
[root@node-a tmpfiles.d]# ll
total 132
-rw-r--r--. 1 root root   29 Mar 16 04:33 ceph-common.conf
-rw-r--r--. 1 root root   35 Apr  1  2020 cryptsetup.conf
-rw-r--r--. 1 root root   26 Mar 16 05:02 ctdb.conf
-rw-r--r--. 1 root root   67 Dec 11 08:32 elasticsearch.conf
-rw-r--r--. 1 root root  464 Nov 17  2020 etc.conf
-rw-r--r--. 1 root root   77 Nov 16  2020 httpd.conf
-rw-r--r--. 1 root root   39 Nov 17  2020 initscripts.conf
-rw-r--r--. 1 root root   75 Dec 15  2020 iscsi.conf
-rw-r--r--. 1 root root 1181 Nov 17  2020 legacy.conf
-rw-r--r--. 1 root root   34 Apr  1  2020 libselinux.conf
-r--r--r--. 1 root root   61 Dec 16  2020 lvm2.conf
-rw-r--r--. 1 root root   34 Oct  1  2020 mdadm.conf
-rw-r--r--. 1 root root   35 Jan 22  2021 net-snmp.conf
-rw-r--r--. 1 root root   27 Sep 30  2020 nscd.conf
-rw-r--r--. 1 root root  104 Sep 30  2020 nss-pam-ldapd.conf
-rw-r--r--. 1 root root   85 Oct  1  2020 openldap.conf
-rw-r--r--. 1 root root  110 Apr  1  2020 pam.conf
-rw-r--r--. 1 root root   15 Apr 10  2017 proftpd.conf
-rw-r--r--. 1 root root   16 Nov 17  2020 python.conf
-rw-r--r--. 1 root root   42 Nov 17  2020 resource-agents.conf
-rw-r--r--. 1 root root   87 Apr  1  2020 rpcbind.conf
-rw-r--r--. 1 root root   22 Oct  1  2020 rpm.conf
-rw-r--r--. 1 root root   60 Mar 16 05:02 samba.conf
-rw-r--r--. 1 root root  228 Nov 17  2020 sap.conf
-rw-r--r--. 1 root root   72 Oct  1  2020 screen.conf
-rw-r--r--. 1 root root  137 Nov 17  2020 selinux-policy.conf
-rw-r--r--. 1 root root  305 Jan 27  2021 sudo.conf
-rw-r--r--. 1 root root 1662 Nov 17  2020 systemd.conf
-rw-r--r--. 1 root root  496 Nov 17  2020 systemd-nologin.conf
-rw-r--r--. 1 root root  638 Nov 17  2020 tmp.conf
-rw-r--r--. 1 root root   56 Mar 22  2019 tuned.conf
-rw-r--r--. 1 root root  563 Nov 17  2020 var.conf
-rw-r--r--. 1 root root  623 Nov 17  2020 x11.conf
[root@node-a tmpfiles.d]# 
[root@node-a tmpfiles.d]# cd /etc/tmpfiles.d/
[root@node-a tmpfiles.d]# ll
total 0
[root@node-a tmpfiles.d]# 

主要的配置文件都被存放在/etc/tmpfiles.d和 /usr/lib/tmpfiles.d/下。其中/usr/lib/tmpfiles.d下可以看到有形形色色的配置文件。Ubuntu版本的产品比较神器，可以自动清理/tmp/sessions下的文件，甚至整个/tmp/sessions都被删除，我们看下配置文件：

root@CVM01:/usr/lib/tmpfiles.d# cat tmp.conf 
D /tmp 1777 root root -
#q /var/tmp 1777 root root 30d

# Exclude namespace mountpoints created with PrivateTmp=yes
x /tmp/systemd-private-%b-*
X /tmp/systemd-private-%b-*/tmp
x /var/tmp/systemd-private-%b-*
X /var/tmp/systemd-private-%b-*/tmp

这里除了我们比较熟悉的路径意外，有写D x X之类的奇奇怪怪的字母，这些何意？

systemd-tmpfiles 配置语法

tmpfiles.d 配置文件定义了一套临时文件管理机制：创建文件、目录、管道、设备节点，调整访问模式、所有者、属性、限额、内容，删除过期文件。主要用于管理易变的临时文件与目录，例如 /run, /tmp, /var/tmp, /sys, /proc, 以及 /var 下面的某些目录

配置文件的格式是每行对应一个路径，包含如下字段：类型, 路径, 权限, 属主, 属组, 寿命, 参数

#Type Path        Mode User Group Age Argument
d     /run/user   0755 root root  10d -
L     /tmp/foobar -    -    -     -   /dev/null

字段值可以用引号界定，并可以包含C风格的转义字符

然后就到了奇奇怪怪的字母部分了，因为它支持的Type太多了，我就一一罗列的，否则也记不住，我重点介绍常用的，如果需要了解细节的，可以阅读金步国的文章。

d 创建指定的目录并赋于指定的User/Group与权限。如果指定的目录已经存在，那么仅调整User/Group与权限。如果指定了”寿命”字段，那么该目录中的内容将遵守基于时间的清理策略。
D 与 d 类似，但是如果使用了 --remove 选项，那么将会清空目录中的所有内容。

注意此处，我们看下Ubuntu的tmp.conf配置文件里面的一行：

D /tmp 1777 root root -

这一行毁天灭地，如果有印象的话，systemd-tmpfiles-setup.service执行的是：

ExecStart=/usr/bin/systemd-tmpfiles --create --remove --boot --exclude-prefix=/dev

这会将/tmp/下的所有东西一扫而空。但是Ubuntu版本有些人会说，没有，启动之后/tmp/目录下还有文件。这些文件是重启之后创建出来的。因此我们就明白了为什么Ubuntu下，/tmp/sessions会被清理，因为实时上不仅它会被清理，整个/tmp/都会被清理。

x 根据”寿命”字段清理过期文件时，忽略指定的路径及该路径下的所有内容。可以在”路径”字段中使用shell风格的通配符。注意，这个保护措施对 r 与 R 无效。
R 在根据”寿命”字段清理过期文件时，仅忽略指定的路径自身而不包括该路径下的其他内容。可以在”路径”字段中使用shell风格的通配符。注意，这个保护措施对 r 与 R 无效。

这两个是额外的保护，即按照时间线来清理的时候，免死金牌，不被清理。但是这个保护措施对后面提到的r和R无效

r 若指定的文件或目录存在，则删除它。不可用于非空目录。可以在”路径”字段中使用shell风格的通配符。不追踪软连接。
R 若指定的文件或目录存在，则递归的删除它。可用于非空目录。可以在”路径”字段中使用shell风格的通配符。不追踪软连接。

这两个删除指令，R可递归，r不可。支持通配符。我们欣赏下Ubuntu 20.04版本里的systemd-tmp.conf

# Exclude namespace mountpoints created with PrivateTmp=yes
x /tmp/systemd-private-%b-*
X /tmp/systemd-private-%b-*/tmp
x /var/tmp/systemd-private-%b-*
X /var/tmp/systemd-private-%b-*/tmp

# Remove top-level private temporary directories on each boot
R! /tmp/systemd-private-*
R! /var/tmp/systemd-private-*

上面赦免不用说了，正常清理过程中，不要按照时间清理/tmp/systemd-private/打头的文件。但是后面R！表示机器重启期间，要将上述文件彻底删除，防止留下垃圾。

使用了感叹号(!)标记的行，仅可在系统启动过程中执行，而不能用于运行中的系统(会破坏系统的正常运行)。未使用感叹号(!)标记的行，可以在任意时间安全的执行(例如升级软件包的时候)。 systemd-tmpfiles 仅在明确使用了 --boot 选项的时候才会执行使用了感叹号(!)标记的行。

分析CentOS版本的行为

# Clear tmp directories separately, to make them easier to override
v /tmp 1777 root root 10d
v /var/tmp 1777 root root 30d

# Exclude namespace mountpoints created with PrivateTmp=yes
x /tmp/systemd-private-%b-*
X /tmp/systemd-private-%b-*/tmp
x /var/tmp/systemd-private-%b-*
X /var/tmp/systemd-private-%b-*/tmp

啥也不说了。根本就不清理 /tmp/system-private-文件。

如何调试systemd-tmpfiles

env SYSTEMD_LOG_LEVEL=debug systemd-tmpfiles --clean

这种方式比较详细，会把中间的判断过程也打印出来，如果清理行为不理解的话，可以用这个方法，看下为什么行为和自己理解的不一样。

[root@node-a tmpfiles.d]# env SYSTEMD_LOG_LEVEL=debug systemd-tmpfiles --clean
Reading config file "/usr/lib/tmpfiles.d/ceph-common.conf".
Reading config file "/usr/lib/tmpfiles.d/cryptsetup.conf".
Reading config file "/usr/lib/tmpfiles.d/ctdb.conf".
Reading config file "/usr/lib/tmpfiles.d/elasticsearch.conf".
Reading config file "/usr/lib/tmpfiles.d/etc.conf".
Reading config file "/usr/lib/tmpfiles.d/httpd.conf".
Reading config file "/usr/lib/tmpfiles.d/initscripts.conf".
Reading config file "/usr/lib/tmpfiles.d/iscsi.conf".
Reading config file "/run/tmpfiles.d/kmod.conf".
Ignoring entry c! "/dev/fuse" because --boot is not specified.
Ignoring entry c! "/dev/cuse" because --boot is not specified.
Ignoring entry c! "/dev/btrfs-control" because --boot is not specified.
Ignoring entry c! "/dev/loop-control" because --boot is not specified.
Ignoring entry c! "/dev/net/tun" because --boot is not specified.
Ignoring entry c! "/dev/ppp" because --boot is not specified.
Ignoring entry c! "/dev/uinput" because --boot is not specified.
Ignoring entry c! "/dev/mapper/control" because --boot is not specified.
Ignoring entry c! "/dev/uhid" because --boot is not specified.
Ignoring entry c! "/dev/vfio/vfio" because --boot is not specified.
Ignoring entry c! "/dev/vhci" because --boot is not specified.
Ignoring entry c! "/dev/vhost-net" because --boot is not specified.
....

如果带上参数 –boot 和–remove，模拟的基本就是开机启动时的清理逻辑：

env SYSTEMD_LOG_LEVEL=debug systemd-tmpfiles --clean

参考文献：

金步国的文章
man systemd-tmpfiles

RAID 卡温度及风扇转速调节

Bean Li — 2022-03-06T10:29:00+00:00

前言

本文讨论RAID卡温度以及潜在的影响。

查看RAID卡的温度

root@scanode1s:~# storcli64 /c0 show all |grep -i temp
Support Temperature = Yes
Temperature Sensor for ROC = Present
Temperature Sensor for Controller = Absent
ROC temperature(Degree Celsius) = 54
Model  State   Temp Mode MfgDate    Next Learn
root@scanode1s:~#

其中ROC temperature 即是我们需要关心的温度。一般来讲，该温度的合理值55摄氏度附近。目前空调状态不太好的机房，可能也会涨到60～80摄氏度。

如果R OC Temperature温度超过105摄氏度，从RAID卡的角度，你就会看到如下类似的日志：

WARNING:Controller temperature threshold exceeded. This may indicate inadequate system cooling, switch to low performace mode.

这种比较可怕，可能会出现大面积掉盘的情况。我们曾遇到客户，冬季客户主动把机房的空调关掉了，结果很快就有盘从RAID组中离线，RAID变成Degrade的情况。

因此，对于一个服务器而言，实时监控RAID卡的散热情况，也是非常重要的。

影响RAID温度的要素

RAID卡温度的要素，无非有如下几个要素

机房温度
风扇转速
磁盘业务压力
RAID组内的一致性检查等带来磁盘I/O的行为

注意，计算机房温度高，风扇有问题等问题存在，但是如果没有任何磁盘I/O基本上也不会导致RAID温度过好。我们遇到的多次RAID卡温度过高，都是糟糕的散热条件，遇到了较高的业务压力，又碰上了一致性检查，多个条件一起作用，终于RAID卡温度飙高不下。

如果发现机房的散热条件不好，或者机器老化等要素，可以针对性地调整一致性检查的速度和模式

调整一致性检查的模式从ModeConc改成ModeSeq，串型模式
调整CCRate从默认的30，调整成15
一致性检查的时间，可以调整成夜间12点这种业务和温度比较低的时间。

风扇转速

我们以超微主板为例，风扇有相关的运行模式 Fan Mode：

Standard Speed
Full Speed
Optimal Speed
HeavyIO Speed

风扇按照控制区域来分，分成两类：

CPU or system Fans，一般被标记成 FAN0 FAN1 FAN2 ，命名方式为FAN+数字，这部分为Zone 0
Peripheral zone Fans，一般被命名为FANA FANB FANC，明明方式为FAN+字母，这部分为Zone 1

上面提到的四种模式：

Standard： BMC 同时控制两个zone， with CPU Zone base CPU temp（target speed 50%），and Peripheral zone based on PCH temp （with target speed 50%）
Optimal： BMC Control of the CPU zone （target speed 30%），with Peripheral zone fixed at low speed （fixed ～30%）
Full： all Fans running at 100%
HeavyIO ： BMC control both CPU zone （target speed 50%） and Peripheral zone fixed at 75%

如果像存储服务器这种，Optimal肯定是不合适了，Full的话也不太合适，因为太吵，可选的就是两个，Standard和HeavyIO。如果保守起见，可以选择Heavy IO，防止散热不好的情况下，RA ID卡温度过高。

如何调整风扇转速和模式

调整风扇模式

我们以全速模式为例，如何讲风扇调整为全速模式：

ipmitool 0x30 0x45 0x01 0x01

注意倒数第二个0x01表示的是Zone：

0x00 表示的是zone 0，即负责CPU zone的风扇
0x01 表示的是zone 1

最后一个0x01 表示的是模式：

standard ：0
Full： 1
Optimal： 2
HeavyIO：4

我们故意做个测试，来看下将风扇模式调整成Full的效果：

调整成Full模式之后，很快的时间内温度就下降下来了。

调整转速

Full模式虽然开心，效果明显，但是很明显噪音很大。所以100%的风扇转速虽然爽，但是忍受不了噪音。那如何处理？

ipmitool raw 0x30 0x70 0x66 0x01 0x 0x

z的合法值为0 和1 ，其中0表示Zone 0， 1 表示Zone 1.

n的合法值是从0x00 到0x64 ，即从0%到100%。

比如说我们觉得Full模式的100%太吵，Heavy IO模式的75%效果虽然不错，但是也太吵，我们可以将Zone1的百分比调整成60%。

 ipmitool raw 0x30 0x70 0x66 0x01 0x1 0x3C

总结

下面总结是对于存储服务器而言的，并非针对所有应用场景

RAID卡的温度要实时监测，确保运行稳定
风扇模式有4种，Full和Optimal都不可取，Standard和Heavy IO可以选择
对于转速不满意的，可以通过ipmitool 指令调节转速，使其在合理范围内调节。

查看Bucket相关的metadata

Bean Li — 2021-12-26T10:29:00+00:00

前言

本文介绍，如何查看bucket相关的元数据信息。

root@CVM01:~# radosgw-admin metadata get bucket:bkt_test
{
    "key": "bucket:bkt_test",
    "ver": {
        "tag": "_avOPFvyGt4fut9GiJPsIXeK",
        "ver": 1
    },
    "mtime": "2021-04-22 07:22:12.706675Z",
    "data": {
        "bucket": {
            "name": "bkt_test",
            "marker": "7240a9f2-42ea-4f3d-b534-ad4d867ab9cb.372941258.1",
            "bucket_id": "7240a9f2-42ea-4f3d-b534-ad4d867ab9cb.372941258.1",
            "tenant": "",
            "explicit_placement": {
                "data_pool": "",
                "data_extra_pool": "",
                "index_pool": ""
            }
        },
        "owner": "s3user1",
        "creation_time": "2021-04-22 07:22:12.559921Z",
        "linked": "true",
        "has_bucket_info": "false"
    }
}
root@CVM01:~# radosgw-admin metadata get bucket.instance:bkt_test:7240a9f2-42ea-4f3d-b534-ad4d867ab9cb.372941258.1
{
    "key": "bucket.instance:bkt_test:7240a9f2-42ea-4f3d-b534-ad4d867ab9cb.372941258.1",
    "ver": {
        "tag": "_CPltbMF1_HktM5SUnn1zwPn",
        "ver": 4
    },
    "mtime": "2021-04-22 07:58:32.656184Z",
    "data": {
        "bucket_info": {
            "bucket": {
                "name": "bkt_test",
                "marker": "7240a9f2-42ea-4f3d-b534-ad4d867ab9cb.372941258.1",
                "bucket_id": "7240a9f2-42ea-4f3d-b534-ad4d867ab9cb.372941258.1",
                "tenant": "",
                "explicit_placement": {
                    "data_pool": "",
                    "data_extra_pool": "",
                    "index_pool": ""
                }
            },
            "creation_time": "2021-04-22 07:22:12.559921Z",
            "owner": "s3user1",
            "flags": 6,
            "zonegroup": "0a4d68f0-0a16-4e0c-a88b-d0b708ce75f1",
            "placement_rule": "default-placement",
            "has_instance_obj": "true",
            "quota": {
                "enabled": false,
                "check_on_raw": false,
                "max_size": -1,
                "max_size_kb": 0,
                "max_objects": -1
            },
            "num_shards": 128,
            "bi_shard_hash_type": 0,
            "requester_pays": "false",
            "has_website": "false",
            "swift_versioning": "false",
            "swift_ver_location": "",
            "index_type": 0,
            "mdsearch_config": [],
            "reshard_status": 0,
            "new_bucket_instance_id": "",
            "worm": {
                "enable": false
            }
        },
        "attrs": [
            {
                "key": "user.rgw.acl",
                "val": "AgKNAAAAAwIWAAAABwAAAHMzdXNlcjEHAAAAczN1c2VyMQQDawAAAAEBAAAABwAAAHMzdXNlcjEPAAAAAQAAAAcAAABzM3VzZXIxBQM6AAAAAgIEAAAAAAAAAAcAAABzM3VzZXIxAAAAAAAAAAACAgQAAAAPAAAABwAAAHMzdXNlcjEAAAAAAAAAAAAAAAAAAAAA"
            },
            {
                "key": "user.rgw.idtag",
                "val": ""
            },
            {
                "key": "user.rgw.lc",
                "val": "AQGJAAAAAQAAAAcAAABkZWZhdWx0BgF0AAAABwAAAGRlZmF1bHQAAAAABwAAAEVuYWJsZWQDAggAAAAAAAAAAAAAAAMCCQAAAAEAAAA1AAAAAAMCCAAAAAAAAAAAAAAAAAEBBAAAAAAAAAABAQwAAAAAAAAAAAAAAAAAAAABAQwAAAAAAAAAAAAAAAAAAAA="
            }
        ]
    }
}

如何修改bucket index shards

root@CVM01:~# radosgw-admin zonegroup get >  tmp.conf
{
    "id": "0a4d68f0-0a16-4e0c-a88b-d0b708ce75f1",
    "name": "default",
    "api_name": "",
    "is_master": "true",
    "endpoints": [],
    "hostnames": [],
    "hostnames_s3website": [],
    "master_zone": "7240a9f2-42ea-4f3d-b534-ad4d867ab9cb",
    "zones": [
        {
            "id": "7240a9f2-42ea-4f3d-b534-ad4d867ab9cb",
            "name": "default",
            "endpoints": [],
            "log_meta": "false",
            "log_data": "false",
            "bucket_index_max_shards": 0,
            "read_only": "false",
            "tier_type": "",
            "sync_from_all": "true",
            "sync_from": []
        }
    ],
    "placement_targets": [
        {
            "name": "default-placement",
            "tags": []
        }
    ],
    "default_placement": "default-placement",
    "realm_id": ""
}

修改 bucket_index_max_shards = 64 之后，然后执行：

radosgw-admin region set < tmp.conf

IFF_UP 与 IFF_RUNNING

Bean Li — 2021-10-23T10:29:00+00:00

前言

近期，发现判断网卡是否Ready的程序出了一些问题，有些网卡明明没有插线，依然会被判定为active的网卡。我去跟踪了下代码：

# struct ifreq { // FOR SIOCGIFFLAGS:
#   char ifrn_name[IFNAMSIZ]
#   short ifru_flags
# };
my $STRUCT_IFREQ_SIOCGIFFLAGS = 'Z' . IFNAMSIZ . 's1';
sub get_active_network_interfaces {
    # Use the interface name list from /proc/net/dev
    open my $fh, '<', '/proc/net/dev'
	or die "failed to open /proc/net/dev: $!\n";
    # And filter by IFF_UP flag fetched via a PF_INET6 socket ioctl:
    my $sock;
    socket($sock, PF_INET6, SOCK_DGRAM, &IPPROTO_IP)
    or socket($sock, PF_INET, SOCK_DGRAM, &IPPROTO_IP)
    or return [];

    my $ifaces = [];
    while(defined(my $line = <$fh>)) {
	next if $line !~ /^\s*([^:\s]+):/;
	my $ifname = $1;
	my $ifreq = pack($STRUCT_IFREQ_SIOCGIFFLAGS, $ifname, 0);
	if (!defined(ioctl($sock, SIOCGIFFLAGS, $ifreq))) {
	    warn "failed to get interface flags for: $ifname\n";
	    next;
	}
	my ($name, $flags) = unpack($STRUCT_IFREQ_SIOCGIFFLAGS, $ifreq);
	push @$ifaces, $ifname if ($flags & IFF_UP);
    }
    close $fh;
    close $sock;
    return $ifaces;
}

发现源码使用通过调用 ioctl SIOCGIFFLAGS 来判定网卡是否已经ready。

关于SIOCGIFFLAGS

Linux ioctl支持一下两个标志位：

SIOCGIFFLAGS：获取设备的活动标志位
SIOCSIFFLAGS：修改设备的活动标志位

       SIOCGIFFLAGS, SIOCSIFFLAGS
              Get or set the active flag word of the device.  ifr_flags contains a bit mask of the following values:

                                           Device flags
              IFF_UP            Interface is running.
              IFF_BROADCAST     Valid broadcast address set.
              IFF_DEBUG         Internal debugging flag.
              IFF_LOOPBACK      Interface is a loopback interface.

              IFF_POINTOPOINT   Interface is a point-to-point link.
              IFF_RUNNING       Resources allocated.
              IFF_NOARP         No arp protocol, L2 destination address not set.
              IFF_PROMISC       Interface is in promiscuous mode.
              IFF_NOTRAILERS    Avoid use of trailers.
              IFF_ALLMULTI      Receive all multicast packets.
              IFF_MASTER        Master of a load balancing bundle.
              IFF_SLAVE         Slave of a load balancing bundle.
              IFF_MULTICAST     Supports multicast
              IFF_PORTSEL       Is able to select media type via ifmap.
              IFF_AUTOMEDIA     Auto media selection active.
              IFF_DYNAMIC       The addresses are lost when the interface goes down.
              IFF_LOWER_UP      Driver signals L1 up (since Linux 2.6.17)
              IFF_DORMANT       Driver signals dormant (since Linux 2.6.17)
              IFF_ECHO          Echo sent packets (since Linux 2.6.25)本次我们重点介绍IFF_UP和 IFF_RUNNING。

IFF_UP：这个标志位表示的是从管理上讲，这个网口是UP的，是Ready的，但是并不代表连接状态。即网卡已经准备好了，但是并不意味着网线已插。

IFF_RUNNING：这个标志位表示operational state，如果置位的话，表示CONNECTED的。

验证

做个简单验证：

#include 
#include 
#include 
#include 
#include 
#include 
#include 
#include 
#include 

#define BUFSIZE 1024

int main(int argc, char *argv[])
{
	struct ifreq ifr;
	int sfd ;

	memset(&ifr, '\0' , sizeof(struct ifreq));
	strcpy(ifr.ifr_name, argv[1]);
	sfd = socket(AF_INET, SOCK_DGRAM, IPPROTO_IP);

	int ret = ioctl(sfd, SIOCGIFFLAGS, (char *)&ifr);
	if(ret !=0)
	{
		printf("failed to exec ioctl");
		goto out;
	}

	printf("%10s IFF_UP: %d\n", argv[1], !!(ifr.ifr_flags & IFF_UP));
	printf("%10s IFF_RUNNING: %d \n",argv[1], !!(ifr.ifr_flags & IFF_RUNNING));

out:
	close(sfd);
	exit(ret);
}

我的笔记本的网口情况如下：

manu-latitude3510 CODE/C » cat /proc/net/dev
Inter-|   Receive                                                |  Transmit
 face |bytes    packets errs drop fifo frame compressed multicast|bytes    packets errs drop fifo colls carrier compressed
    lo: 1503282   11818    0    0    0     0          0         0  1503282   11818    0    0    0     0       0          0
  eno1:       0       0    0    0    0     0          0         0        0       0    0    0    0     0       0          0
wlp1s0: 14947732561 40405691    0 822774    0     0          0         0 650855154 3591298    0    0    0     0       0          0
manu-latitude3510 CODE/C »

其中eno1 是有线，并未插线，wlp1s0是无线，已经连接的状态。

manu-latitude3510 CODE/C » ./nettool eno1                                                                                                                
      eno1 IFF_UP: 1
      eno1 IFF_RUNNING: 0
manu-latitude3510 CODE/C » ./nettool wlp1s0
    wlp1s0 IFF_UP: 1
    wlp1s0 IFF_RUNNING: 1
manu-latitude3510 CODE/C » ./nettool lo
        lo IFF_UP: 1
        lo IFF_RUNNING: 1
manu-latitude3510 CODE/C

上图不说话：关闭服务器节能模式

Bean Li — 2021-03-07T10:29:00+00:00