Troubleshooting
Core dumps are typically the result of a programming error. Core dump creation is generally disabled on most Linux distributions. To enable core dumps, execute "ulimit -c unlimited" before running your application. You can also place the ulimit command in your shell startup file. You do not need root permissions to enable the creation of core dumps.
ulimit -a # to display your ulimit settings
ulimit -c unlimited # to enable creation of core dump files
If an NFS share won't release when you try to unmount it, use the lazy option:
umount -l /some-path-name
Find out what manpages discuss some "topic"
apropos topic
For example:
[sandholm@sys1-lnx ~]$ apropos trace
autrace (8) - a program similar to strace
cluck [Carp] (3pm) - warn of errors with stack backtrace (not exported
by default)
confess [Carp] (3pm) - die of errors with stack backtrace
groff_trace (7) - groff macro package trace.tmac
ltrace (1) - A library call tracer
mtrace (3) - malloc debugging
muntrace [mtrace] (3) - malloc debugging
ntptrace (1) - trace a chain of NTP servers back to the primary source
pstack (1) - print a stack trace of a running process
ptrace (2) - process trace
strace (1) - trace system calls and signals
trace [curs_trace] (3x) - curses debugging routines
tracepath (8) - traces path to a network host discovering MTU along this path
tracepath6 [tracepath] (8) - traces path to a network host discovering MTU along this path
traceroute (8) - print the route packets take to network host
[sandholm@sys1-lnx ~]$
Find all files that have changed in the last 30 minutes:
find /pathname -mmin -30
For example:
[root@sys1-lnx log]# find /var/log -mmin -30
/var/log/sa/sa21
/var/log/mail/statistics
/var/log/cron
/var/log/puppet/masterhttp.log
/var/log/messages
/var/log/maillog
/var/log/secure
[root@sys1-lnx log]#
Find out the "type" of a file:
file filename
For example:
[sandholm@sys1-lnx ~]$ file debian_live.iso
debian_live.iso: ISO 9660 CD-ROM filesystem data 'Debian Live 20070123 ' (bootable)
[sandholm@sys1-lnx ~]$ file prog.tgz
prog.tgz: gzip compressed data, from Unix
[sandholm@sys1-lnx ~]$ file sw_query
sw_query: Bourne-Again shell script text executable
[sandholm@sys1-lnx ~]$ file telnet-0.17-25.src.rpm
telnet-0.17-25.src.rpm: RPM v3 src i386 telnet-0.17-25
[sandholm@sys1-lnx ~]$ file /bin/cat
/bin/cat: ELF 32-bit LSB executable, Intel 80386, version 1 (SYSV), for GNU/Linux 2.2.5, dynamically linked (uses shared libs), stripped
[sandholm@sys1-lnx ~]$ file jems-installer-1.2.0.BETA2.jar
jems-installer-1.2.0.BETA2.jar: Zip archive data, at least v2.0 to extract
[sandholm@sys1-lnx ~]$
Find out what processes are offering TCP services:
lsof -i tcp
lsof -i tcp -i udp
For example:
[root@sys1-lnx ~]# lsof -i tcp
COMMAND PID USER FD TYPE DEVICE SIZE NODE NAME
oracle 378 oracle 11u IPv4 130698079 TCP sys-lnx.intdata.com:1523->sys-lnx.intdata.com:56381 (ESTABLISHED)
ssh 1598 root 3u IPv4 55023089 TCP sys1-lnx:2133->ngfdev-lnx.intdata.com:ssh (ESTABLISHED)
oracle 1865 oracle 11u IPv4 140007433 TCP sys-lnx.intdata.com:1523->sys-lnx.intdata.com:52089 (ESTABLISHED)
isql 2785 clark 6u IPv4 164410437 TCP sys1-lnx:64323->gdb-sun.intdata.com:2045 (ESTABLISHED)
ssh 3031 root 3u IPv4 62177768 TCP sys1-lnx:29770->gums2-sun.intdata.com:ssh (ESTABLISHED)
ssh 3109 root 3u IPv4 56187735 TCP sys1-lnx:24655->ngfdev-lnx.intdata.com:ssh (ESTABLISHED)
oracle 3283 oracle 11u IPv4 139101609 TCP sys-lnx.intdata.com:1523->sys-lnx.intdata.com:44733 (ESTABLISHED)
java 3986 root 9u IPv4 91355834 TCP sys1-lnx:43065->ma2djharris.corp.us.intdata.com:x11 (ESTABLISHED)
java 3986 root 13u IPv4 91386558 TCP sys1-lnx:43902->gendev-lnx.intdata.com:2148 (ESTABLISHED)
tqdportal 4714 nobody 5u IPv4 164210008 TCP localhost.localdomain:58572 (LISTEN)
tqdportal 4714 nobody 6u IPv4 164210018 TCP localhost.localdomain:58572->localhost.localdomain:58573 (ESTABLISHED)
tqdportal 4714 nobody 9u IPv4 164210019 TCP localhost.localdomain:58572->localhost.localdomain:58586 (ESTABLISHED)
tqdportal 4714 nobody 17u IPv4 164289324 TCP sys1-lnx:60741->gdb-sun.intdata.com:2780 (CLOSE_WAIT)
tqdportal 4714 nobody 19u IPv4 164363017 TCP sys1-lnx:62995->gdb-sun.intdata.com:2780 (CLOSE_WAIT)
tqdportal 4714 nobody 21u IPv4 164302608 TCP sys1-lnx:61088->gdb-sun.intdata.com:2780 (CLOSE_WAIT)
tqdportal 4714 nobody 23u IPv4 164320030 TCP sys1-lnx:61621->gdb-sun.intdata.com:2780 (CLOSE_WAIT)
ssh 4882 root 3u IPv4 46760693 TCP sys1-lnx:21117->fis-sun.intdata.com:ssh (ESTABLISHED)
isql 5969 clark 6u IPv4 164421159 TCP sys1-lnx:64734->gdb-sun.intdata.com:2085 (ESTABLISHED)
oracle 6167 oracle 11u IPv4 130717030 TCP sys-lnx.intdata.com:1523->sys-lnx.intdata.com:57088 (ESTABLISHED)
ssh 6455 root 3u IPv4 47933581 TCP sys1-lnx:34558->fis-sun.intdata.com:ssh (ESTABLISHED)
userinfo. 6530 kramer 7u IPv4 62189281 TCP sys1-lnx:30116->gdb-sun.intdata.com:2085 (ESTABLISHED)
To find out if there are network performance problems:
netstat -s
For example:
[root@sys1-lnx ~]# netstat -s
Ip:
837843325 total packets received
0 forwarded
0 incoming packets discarded
837838258 incoming packets delivered
1510520829 requests sent out
Icmp:
29917 ICMP messages received
1170 input ICMP message failed.
ICMP input histogram:
destination unreachable: 16429
timeout in transit: 20
echo requests: 157
echo replies: 13305
17875 ICMP messages sent
0 ICMP messages failed
ICMP output histogram:
destination unreachable: 17718
echo replies: 157
Tcp:
5302702 active connections openings
3611353 passive connection openings
1865 failed connection attempts
1361865 connection resets received
161 connections established
793357298 segments received
1486387754 segments send out
244786 segments retransmited
0 bad segments received.
687200 resets sent
To strace an already running process:
strace -p pidnumber
Find out the shell environment of running processes:
ps auxe
Find out what files a process has open
lsof -p pid-number-of-process
Show network interface cards that are configured & operational
ifconfig
Find out what process has a file open
fuser /path-to-file
Terminate process(es) that have a file open
fuser -k /path-to-file
Find out what libraries a program depends on
ldd /path-to-program
Find out what kernel options were used when the system was booted
cat /proc/cmdline
Find process that has the most files open:
as root
lsof | awk '{print $2}' | sort -n | uniq -c | sort -n
NOTE: We've had some issues with Oracle emagent consuming all of the system's file descriptors. Running the above command lists the count of open files per PID, with the highest count last. If you find that emagent has the highest file descriptor count, you can kill it or restart it.
Here's a script that may be useful for this:
#!/bin/bash
#
# Report the process that has the most entries in lsof's output, i.e. the
# PID holding the most open files, followed by its process table entry.
# The surrounding notes say to run this as root.

#######################################
# Pick the PID that occurs most often in lsof output.
# Arguments: none (reads lsof output on stdin)
# Outputs:   one line "<count> <pid>" on stdout; nothing when the input
#            contains no rows with a numeric PID column
#######################################
busiest_pid() {
  # Keep only rows whose second column is a number: this drops the
  # "COMMAND PID ..." header so it is never counted as a process.
  awk '$2 ~ /^[0-9]+$/ { print $2 }' \
    | sort -n \
    | uniq -c \
    | sort -n \
    | tail -n 1
}

#######################################
# Run lsof, find the busiest PID and print the report.
# Outputs:   the three "> ..." report lines and the ps entry on stdout;
#            a diagnostic on stderr when no data is available
# Returns:   1 when lsof produced nothing usable, otherwise ps's status
#######################################
main() {
  local top count pid
  top=$(lsof | busiest_pid)

  # read -r splits "<count> <pid>" without touching the positional
  # parameters; a bare `set` on an empty value would dump every variable.
  read -r count pid <<<"$top"

  if [[ -z "$count" || -z "$pid" ]]; then
    echo "No open-file data returned by lsof" >&2
    return 1
  fi

  echo "> Number of open files: $count"
  echo "> Pid number: $pid"
  echo "> Process Table Entry:"
  ps -lf -p "$pid"
}

main "$@"
Find out what files a program is trying to open:
strace -eopen ls -al
Example output:
[sandholm@sys1-lnx ~]$ strace -eopen ls -al
open("/etc/ld.so.cache", O_RDONLY) = 3
open("/lib/tls/librt.so.1", O_RDONLY) = 3
open("/lib/libacl.so.1", O_RDONLY) = 3
open("/lib/libselinux.so.1", O_RDONLY) = 3
open("/lib/tls/libc.so.6", O_RDONLY) = 3
open("/lib/tls/libpthread.so.0", O_RDONLY) = 3
open("/lib/libattr.so.1", O_RDONLY) = 3
open("/etc/selinux/config", O_RDONLY) = 3
open("/proc/mounts", O_RDONLY) = 3
open("/usr/lib/locale/locale-archive", O_RDONLY|O_LARGEFILE) = 3
open("/proc/filesystems", O_RDONLY) = 3
open("/usr/share/locale/locale.alias", O_RDONLY) = 3
open("/usr/share/locale/en_US.UTF-8/LC_TIME/coreutils.mo", O_RDONLY) = -1 ENOENT (No such file or directory)
open("/usr/share/locale/en_US.utf8/LC_TIME/coreutils.mo", O_RDONLY) = -1 ENOENT (No such file or directory)
open("/usr/share/locale/en_US/LC_TIME/coreutils.mo", O_RDONLY) = -1 ENOENT (No such file or directory)
open("/usr/share/locale/en.UTF-8/LC_TIME/coreutils.mo", O_RDONLY) = -1 ENOENT (No such file or directory)
open("/usr/share/locale/en.utf8/LC_TIME/coreutils.mo", O_RDONLY) = -1 ENOENT (No such file or directory)
open("/usr/share/locale/en/LC_TIME/coreutils.mo", O_RDONLY) = -1 ENOENT (No such file or directory)
open(".", O_RDONLY|O_NONBLOCK|O_LARGEFILE|O_DIRECTORY) = 3
open("/etc/nsswitch.conf", O_RDONLY) = 4
open("/etc/ld.so.cache", O_RDONLY) = 4
open("/lib/libnss_files.so.2", O_RDONLY) = 4
open("/etc/passwd", O_RDONLY) = 4
open("/etc/group", O_RDONLY) = 4
open("/etc/passwd", O_RDONLY) = 4
open("/etc/group", O_RDONLY) = 4
open("/etc/mtab", O_RDONLY) = 3
open("/proc/meminfo", O_RDONLY) = 3
open("/usr/share/locale/en_US.UTF-8/LC_MESSAGES/coreutils.mo", O_RDONLY) = -1 ENOENT (No such file or directory)
open("/usr/share/locale/en_US.utf8/LC_MESSAGES/coreutils.mo", O_RDONLY) = -1 ENOENT (No such file or directory)
open("/usr/share/locale/en_US/LC_MESSAGES/coreutils.mo", O_RDONLY) = -1 ENOENT (No such file or directory)
open("/usr/share/locale/en.UTF-8/LC_MESSAGES/coreutils.mo", O_RDONLY) = -1 ENOENT (No such file or directory)
open("/usr/share/locale/en.utf8/LC_MESSAGES/coreutils.mo", O_RDONLY) = -1 ENOENT (No such file or directory)
open("/usr/share/locale/en/LC_MESSAGES/coreutils.mo", O_RDONLY) = -1 ENOENT (No such file or directory)
total 1259664
open("/etc/localtime", O_RDONLY) = 3
drwx------ 14 sandholm wow 4096 Aug 20 16:04 .
drwxr-xr-x 21 root root 4096 Jul 15 13:55 ..
-rw-r--r-- 1 sandholm wow 23477 Jul 31 10:27 any
-rwx------ 1 sandholm wow 404 Jan 2 2007 authorized_keys
-rw------- 1 sandholm wow 16352 Aug 20 13:48 .bash_history
-rw-r--r-- 1 sandholm wow 24 Nov 8 2006 .bash_logout
-rw-r--r-- 1 sandholm wow 257 Jul 5 2007 .bash_profile
-rw-r--r-- 1 sandholm wow 124 Nov 8 2006 .bashrc
-rwxrwxr-x 1 sandholm wow 628 Nov 14 2006 check_login
-rw------- 1 sandholm wow 63 Jul 5 2007 .cvspass
Find out what a program is trying to do on the network:
as root:
strace -e trace=network curl --head http://www.redhat.com
[sandholm@sys1-lnx ~]$ strace -e trace=network curl --head http://www.redhat.com
socket(PF_INET6, SOCK_DGRAM, IPPROTO_IP) = -1 EAFNOSUPPORT (Address family not supported by protocol)
socket(PF_FILE, SOCK_STREAM, 0) = 3
connect(3, {sa_family=AF_FILE, path="/var/run/nscd/socket"}, 110) = -1 ENOENT (No such file or directory)
socket(PF_FILE, SOCK_STREAM, 0) = 3
connect(3, {sa_family=AF_FILE, path="/var/run/nscd/socket"}, 110) = -1 ENOENT (No such file or directory)
socket(PF_INET, SOCK_DGRAM, IPPROTO_IP) = 3
connect(3, {sa_family=AF_INET, sin_port=htons(53), sin_addr=inet_addr("172.24.0.31")}, 28) = 0
send(3, "$\274\1\0\0\1\0\0\0\0\0\0\3www\6redhat\3com\0\0\1\0\1", 32, MSG_NOSIGNAL) = 32
recvfrom(3, "$\274\201\200\0\1\0\4\0\t\0\4\3www\6redhat\3com\0\0\1\0"..., 1024, 0, {sa_family=AF_INET, sin_port=htons(53), s
in_addr=inet_addr("172.24.0.31")}, [16]) = 405
socket(PF_INET, SOCK_DGRAM, IPPROTO_IP) = 3
connect(3, {sa_family=AF_INET, sin_port=htons(53), sin_addr=inet_addr("172.24.0.31")}, 28) = 0
send(3, "?\300\1\0\0\1\0\0\0\0\0\0\003112\00232\003246\00272\7i"..., 44, MSG_NOSIGNAL) = 44
recvfrom(3, "?\300\201\200\0\1\0\1\0\10\0\10\003112\00232\003246\002"..., 1024, 0, {sa_family=AF_INET, sin_port=htons(53), s
in_addr=inet_addr("172.24.0.31")}, [16]) = 399
socket(PF_INET, SOCK_STREAM, IPPROTO_TCP) = 3
connect(3, {sa_family=AF_INET, sin_port=htons(80), sin_addr=inet_addr("72.246.32.112")}, 16) = -1 EINPROGRESS (Operation now
in progress)
getsockopt(3, SOL_SOCKET, SO_ERROR, [0], [4]) = 0
send(3, "HEAD / HTTP/1.1\r\nUser-Agent: cur"..., 177, 0) = 177
recv(3, "HTTP/1.1 200 OK\r\nServer: Apache\r"..., 16383, 0) = 363
HTTP/1.1 200 OK
Server: Apache
Content-Length: 10819
Content-Type: text/html; charset=ISO-8859-1
Expires: Wed, 20 Aug 2008 20:20:09 GMT
Cache-Control: max-age=0, no-cache, no-store
Pragma: no-cache
Date: Wed, 20 Aug 2008 20:20:09 GMT
Connection: keep-alive
Set-Cookie: Apache=72.246.32.23.1219263609383542; path=/; expires=Mon, 19-Aug-13 20:20:09 GMT
Process 23290 detached
[sandholm@sys1-lnx ~]$