Linux Cheatsheet 11 — Text Processing

Text processing cheatsheet.

grep

grep "pattern" file
grep -i pattern file                # case-insensitive
grep -v pattern file                # invert
grep -r pattern dir/                # recursive
grep -rn pattern dir/               # with line numbers
grep -rl pattern dir/               # filenames only (needs -r to search a directory)
grep -c pattern file                # count
grep -E "a|b" file                  # extended regex (same as egrep)
grep -P "\d+" file                  # perl regex
grep -A 3 pattern file              # 3 lines after
grep -B 3 pattern file              # before
grep -C 3 pattern file              # context

grep --include="*.py" -r pattern .
grep --exclude-dir=node_modules -r pattern .

ripgrep (faster)

rg pattern
rg -t py pattern                    # only Python files
rg -g '*.py' pattern
rg --json pattern | jq
rg -A 3 pattern

sed

sed 's/old/new/' file               # first occurrence per line
sed 's/old/new/g' file              # all
sed -i 's/old/new/g' file           # in place (GNU sed; BSD/macOS needs: sed -i '' 's/old/new/g' file)
sed -i.bak 's/old/new/g' file       # backup as file.bak
sed '5,10d' file                    # delete lines 5-10
sed '/pattern/d' file               # delete matching
sed -n '/pattern/p' file            # print only matching
sed 's|/old/path|/new/path|g'       # alt delimiter
sed -E 's/([0-9]+)/<\1>/g'          # extended regex + backref

awk

awk '{print $1}' file               # first column
awk '{print $1, $3}' file
awk -F: '{print $1}' /etc/passwd    # custom delimiter
awk '/pattern/ {print $1}'
awk 'NR>1' file                     # skip header
awk 'NR%2==0' file                  # even lines
awk '{sum+=$1} END {print sum}'
awk '{count[$1]++} END {for (k in count) print k, count[k]}'
awk 'length($0) > 80'               # long lines
awk -v threshold=100 '$2 > threshold'

cut

cut -d: -f1 /etc/passwd
cut -d, -f2,4 csv
cut -c1-10 file                     # chars 1-10
cut -c1,5,10 file                   # specific chars

tr

echo "HELLO" | tr A-Z a-z
echo "a b c" | tr ' ' '\n'
echo "a-b-c" | tr -d '-'            # delete
tr -s ' '                           # squeeze repeats
cat file | tr -cd 'a-zA-Z0-9 \n'    # keep only these

sort / uniq

sort file
sort -n file                        # numeric
sort -r file                        # reverse
sort -k 2 file                      # by 2nd column through end of line (use -k 2,2 for the 2nd column only)
sort -t: -k3 -n /etc/passwd         # by UID
sort -u file                        # unique

sort | uniq                         # uniq needs sorted
sort | uniq -c                      # with count
sort | uniq -d                      # only dupes
sort | uniq -u                      # only uniques

# Top 10 most common
sort | uniq -c | sort -rn | head -10

head / tail

head -n 20 file
tail -n 20 file
tail -f file                        # follow
tail -F file                        # follow + reopen
tail -n +10 file                    # from line 10 to end

wc

wc -l file                          # lines
wc -w file                          # words
wc -c file                          # bytes
wc -m file                          # chars

paste / join

paste a.txt b.txt                   # side-by-side
paste -d, a.txt b.txt               # comma delim
join file1 file2                    # SQL-like join on first col (both files must be sorted on it)

column

column -t file                      # align columns
ps aux | column -t

xargs

ls | xargs rm                       # like a loop: runs rm on every listed name (breaks on names with spaces)
find . -name "*.log" | xargs rm
find . -name "*.log" -print0 | xargs -0 rm   # handles spaces and newlines in names
echo "a b c" | xargs -n 1           # one arg per line
echo "a b c" | xargs -I{} echo "got: {}"
xargs -P 4 -n 1 cmd                 # parallel

jq

echo '{"a":1}' | jq '.a'
jq '.users[] | .name' file.json
jq '.users[].name' file.json
jq '.users | length'
jq '.users[] | select(.active)'
jq '.users[] | {name, email}'
jq -r '.users[].name'               # raw (no quotes)
jq -c '.'                           # compact
jq 'to_entries | map(...)'

yq (YAML)

yq '.spec.replicas' deploy.yaml
yq -i '.spec.replicas = 5' deploy.yaml

diff / patch

diff a b
diff -u a b > patch
patch a < patch
patch -R a < patch                  # reverse

Common pipelines

# Top processes by mem
ps aux | sort -k4 -rn | head

# Count log levels
grep -oE 'level=\w+' app.log | sort | uniq -c | sort -rn

# Sum bytes from access log
awk '{sum+=$10} END {print sum}' access.log

# 95th percentile
awk '{print $9}' access.log | sort -n | awk '{a[NR]=$1} END {print a[int(NR*0.95)]}'

# IP rank
awk '{print $1}' access.log | sort | uniq -c | sort -rn | head

# Files by line count
find . -name "*.py" -exec wc -l {} + | sort -rn

# Disk usage by extension
find /var -type f -printf '%s %p\n' | awk '{ext=$2; sub(/.*\./,"",ext); s[ext]+=$1} END {for(k in s) print s[k], k}' | sort -rn

Common mistakes

grep with regex without -E / -P — only basic regex (BRE) is available, so |, + and ? must be backslash-escaped (\d is not supported).
sed -i without backup → no undo.
awk $0 vs $1: $0 whole line, $1 first field.
xargs without -0 breaks on file names that contain spaces or newlines (pair it with find -print0).
Forgetting sort before uniq.

grep#

ripgrep (faster)#

sed#

awk#

cut#

tr#

sort / uniq#

head / tail#

wc#

paste / join#

column#

xargs#

jq#

yq (YAML)#

diff / patch#

Common pipelines#

Common mistakes#

Read this next#

grep

ripgrep (faster)

sed

awk

cut

tr

sort / uniq

head / tail

wc

paste / join

column

xargs

jq

yq (YAML)

diff / patch

Common pipelines

Common mistakes

Read this next