Compare commits
7 Commits
5fbd72b370
...
master
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
d0a9262cd8 | ||
|
|
c913fc0fb1 | ||
|
|
e2e68c8e81 | ||
|
|
3a289ecff2 | ||
|
|
7d0392b703 | ||
|
|
0cff05d623 | ||
|
|
940a86ff8c |
1
.gitignore
vendored
1
.gitignore
vendored
@@ -1 +1,2 @@
|
||||
/target
|
||||
.cargo-cache
|
||||
|
||||
824
Cargo.lock
generated
824
Cargo.lock
generated
File diff suppressed because it is too large
Load Diff
@@ -19,3 +19,12 @@ anyhow = "1.0.98"
|
||||
bytes = "1.10.1"
|
||||
tokio = { version = "1.42", features = ["full"] }
|
||||
clap = { version = "4.5", features = ["derive"] }
|
||||
axum = "0.7"
|
||||
axum-extra = { version = "0.9", features = ["typed-header"] }
|
||||
tower = "0.4"
|
||||
tower-http = { version = "0.5", features = ["cors", "auth"] }
|
||||
serde = { version = "1.0", features = ["derive"] }
|
||||
serde_json = "1.0"
|
||||
chrono = { version = "0.4", features = ["serde"] }
|
||||
bcrypt = "0.15"
|
||||
base64 = "0.22"
|
||||
5
Dockerfile.arm
Normal file
5
Dockerfile.arm
Normal file
@@ -0,0 +1,5 @@
|
||||
# Builder image for cross-compiling to aarch64-unknown-linux-gnu.
FROM rust:1.93.1-bullseye

# Install the Rust target and the GNU cross toolchain.
# `apt-get` is used instead of `apt`, whose CLI is not meant for scripts, and the
# package lists are removed afterwards so they do not bloat the image layer.
# Recommended packages are intentionally kept: the cross libc development files
# are pulled in that way.
RUN rustup target add aarch64-unknown-linux-gnu \
    && apt-get update \
    && apt-get install -y gcc-aarch64-linux-gnu \
    && rm -rf /var/lib/apt/lists/*
|
||||
7
Makefile
Normal file
7
Makefile
Normal file
@@ -0,0 +1,7 @@
|
||||
|
||||
|
||||
# Neither target produces a file of its own name; declare them phony so a stray
# `build` or `image` file/directory cannot make them look up to date.
.PHONY: build image

# Cross-compile for aarch64 inside the builder container.
# $(CURDIR) is set by make itself, so the mounts stay correct under `make -C <dir>`,
# where the inherited $(PWD) would still point at the caller's directory.
build: image
	podman run -it --rm -v $(CURDIR):/app -v $(CURDIR)/.cargo-cache:/usr/local/cargo/registry -w /app route-switcher-builder:latest cargo build --target aarch64-unknown-linux-gnu

# Build the cross-compilation image from Dockerfile.arm.
image:
	podman build -f Dockerfile.arm -t route-switcher-builder:latest .
|
||||
127
README.md
127
README.md
@@ -8,31 +8,25 @@ Route-Switcher monitors connectivity to specified IP addresses via multiple netw
|
||||
|
||||
## Architecture
|
||||
|
||||
### Core Components
|
||||
Route-Switcher consists of three main components:
|
||||
|
||||
1. **Async Pingers** (`src/pinger.rs`)
|
||||
- Dual-interface ICMP monitoring
|
||||
- Explicit interface binding (equivalent to `ping -I <interface>`)
|
||||
- Configurable ping targets and intervals
|
||||
- Async/await implementation with tokio
|
||||
1. **Async Pingers** (`src/pinger.rs`) - ICMP monitoring with explicit interface binding
|
||||
2. **Route Manager** (`src/routing.rs`) - Netlink-based route manipulation
|
||||
3. **State Machine** (`src/main.rs`) - Failover logic with anti-flapping protection
|
||||
|
||||
2. **Route Manager** (`src/routing.rs`)
|
||||
- Netlink-based route manipulation
|
||||
- No external dependencies on `ip` command
|
||||
- Route addition and deletion
|
||||
- Metric-based route prioritization
|
||||
### State Machine
|
||||
```
|
||||
Boot → Primary: After 10 seconds of sampling
|
||||
Primary → Fallback: After 3 consecutive failures AND secondary is healthy
|
||||
Fallback → Primary: After 60 seconds of stable primary connectivity
|
||||
```
|
||||
|
||||
3. **State Machine** (`src/main.rs`)
|
||||
- Failover logic with anti-flapping protection
|
||||
- Three consecutive failures trigger failover
|
||||
- One minute of stable connectivity triggers failback
|
||||
- Prevents switching when both interfaces fail
|
||||
### Route Management Strategy
|
||||
- **Primary route**: metric 10 (default priority)
|
||||
- **Secondary route**: metric 20 (lower priority)
|
||||
- **Failover route**: metric 5 (highest priority, added only during failover)
|
||||
|
||||
4. **Configuration**
|
||||
- Interface definitions (primary/secondary)
|
||||
- Gateway configurations
|
||||
- Ping targets and timing
|
||||
- Route metrics
|
||||
The system maintains both base routes continuously and adds/removes the failover route as needed.
|
||||
|
||||
## Key Features
|
||||
|
||||
@@ -105,62 +99,74 @@ RUST_LOG=debug sudo cargo run
|
||||
RUST_LOG=info sudo cargo run
|
||||
```
|
||||
|
||||
## Testing Environment
|
||||
|
||||
### Podman-Compose Setup
|
||||
The project includes a complete testing environment using podman-compose:
|
||||
## Testing
|
||||
|
||||
### Quick Test
|
||||
```bash
|
||||
# Start test environment
|
||||
podman-compose up -d
|
||||
|
||||
# Run automated failover test
|
||||
./scripts/test-failover.sh
|
||||
|
||||
# View logs
|
||||
podman-compose logs -f route-switcher
|
||||
|
||||
# Stop test environment
|
||||
# Stop environment
|
||||
podman-compose down
|
||||
```
|
||||
|
||||
### End-to-End Testing
|
||||
### Manual Testing
|
||||
```bash
|
||||
# Simulate primary interface failure
|
||||
podman-compose exec primary ip link set eth0 down
|
||||
# Test primary connectivity
|
||||
podman-compose exec route-switcher ping -c 3 -I eth0 192.168.202.100
|
||||
|
||||
# Observe failover in logs
|
||||
podman-compose logs -f route-switcher
|
||||
# Test secondary connectivity
|
||||
podman-compose exec route-switcher ping -c 3 -I eth1 192.168.202.100
|
||||
|
||||
# Restore primary interface
|
||||
podman-compose exec primary ip link set eth0 up
|
||||
# Simulate primary router failure
|
||||
podman-compose exec primary-router ip link set eth0 down
|
||||
|
||||
# Observe failback after 1 minute
|
||||
# Check routing table
|
||||
podman-compose exec route-switcher ip route show
|
||||
```
|
||||
|
||||
## Implementation Details
|
||||
## API (Optional)
|
||||
|
||||
### State Machine
|
||||
```
|
||||
[Boot] -> [Primary] (after initial connectivity check)
|
||||
[Primary] -> [Fallback] (after 3 consecutive failures)
|
||||
[Fallback] -> [Primary] (after 60 seconds of stability)
|
||||
The route-switcher includes an optional HTTP REST API for monitoring and control.
|
||||
|
||||
### Configuration
|
||||
```bash
|
||||
# Enable API
|
||||
API_ENABLED=true
|
||||
API_USERNAME=admin
|
||||
API_PASSWORD_HASH=<bcrypt-hash>
|
||||
API_PORT=8080
|
||||
```
|
||||
|
||||
### Route Management
|
||||
- Primary route: `ip r add default via <primary-gw> dev <primary-iface> metric 10`
|
||||
- Secondary route: `ip r add default via <secondary-gw> dev <secondary-iface> metric 20`
|
||||
- Routes are managed via netlink, not external commands
|
||||
### Endpoints
|
||||
- **GET /api/state** - Returns current state and ping statistics
|
||||
- **POST /api/state** - Manually set state (primary/secondary)
|
||||
|
||||
### Failover Logic
|
||||
1. **Detection**: 3 consecutive ping failures on primary interface
|
||||
2. **Verification**: Secondary interface must be responsive
|
||||
3. **Switch**: Update routing table to use secondary gateway
|
||||
4. **Monitor**: Continue monitoring both interfaces
|
||||
5. **Recovery**: After 60 seconds of stable primary connectivity, switch back
|
||||
|
||||
### Error Handling
|
||||
- Graceful degradation on interface failures
|
||||
- Comprehensive logging for debugging
|
||||
- Signal handling for clean shutdown
|
||||
- Recovery from temporary network issues
|
||||
### Example Response
|
||||
```json
|
||||
{
|
||||
"state": "Primary",
|
||||
"primary_stats": {
|
||||
"success_rate": 95.5,
|
||||
"failures": 2,
|
||||
"total_pings": 44,
|
||||
"last_ping": "Ok"
|
||||
},
|
||||
"secondary_stats": {
|
||||
"success_rate": 98.2,
|
||||
"failures": 1,
|
||||
"total_pings": 56,
|
||||
"last_ping": "Ok"
|
||||
},
|
||||
"last_failover": "2024-02-15T10:30:00Z"
|
||||
}
|
||||
```
|
||||
|
||||
## Dependencies
|
||||
|
||||
@@ -169,13 +175,8 @@ podman-compose exec primary ip link set eth0 up
|
||||
- `netlink-sys` - Netlink kernel communication
|
||||
- `anyhow` - Error handling
|
||||
- `log` + `env_logger` - Logging
|
||||
- `crossbeam-channel` - Inter-thread communication
|
||||
- `signal-hook` - Signal handling
|
||||
|
||||
## Development Phases
|
||||
|
||||
- [ ] End-to-end automated tests
|
||||
- `clap` - Command line parsing
|
||||
|
||||
## License
|
||||
|
||||
GPLv3
|
||||
GPLv3
|
||||
@@ -1,167 +0,0 @@
|
||||
# Architecture Documentation
|
||||
|
||||
## System Overview
|
||||
|
||||
Route-Switcher is a network failover system that operates at the application layer to provide automatic network redundancy. The system monitors network connectivity through multiple interfaces and manages routing tables to ensure continuous connectivity.
|
||||
|
||||
## Component Architecture
|
||||
|
||||
```
|
||||
┌─────────────────┐ ┌──────────────────┐ ┌─────────────────┐
|
||||
│ Main Thread │ │ Async Pingers │ │ Route Manager │
|
||||
│ │ │ │ │ │
|
||||
│ • State Machine │◄──►│ • Interface A │◄──►│ • Netlink API │
|
||||
│ • Decision Logic│ │ • Interface B │ │ • Route Add/Del │
|
||||
│ • Coordination │ │ • ICMP Monitoring│ │ • Metric Mgmt │
|
||||
└─────────────────┘ └──────────────────┘ └─────────────────┘
|
||||
│ │ │
|
||||
└───────────────────────┼───────────────────────┘
|
||||
│
|
||||
┌──────────────────┐
|
||||
│ Linux Kernel │
|
||||
│ │
|
||||
│ • Routing Table │
|
||||
│ • Network Stack │
|
||||
│ • Netlink Socket │
|
||||
└──────────────────┘
|
||||
```
|
||||
|
||||
## Data Flow
|
||||
|
||||
1. **Monitoring Phase**
|
||||
- Async pingers send ICMP packets via both interfaces
|
||||
- Results are collected and sent to main thread
|
||||
- State machine evaluates connectivity patterns
|
||||
|
||||
2. **Decision Phase**
|
||||
- State machine determines if failover is needed
|
||||
- Verifies secondary interface health
|
||||
- Triggers route changes if conditions are met
|
||||
|
||||
3. **Action Phase**
|
||||
- Route manager updates kernel routing table
|
||||
- Changes are applied via netlink interface
|
||||
- System continues monitoring in new state
|
||||
|
||||
## State Machine Design
|
||||
|
||||
### States
|
||||
- **Boot**: Initial state, gathering connectivity data
|
||||
- **Primary**: Using primary interface for routing
|
||||
- **Fallback**: Using secondary interface for routing
|
||||
|
||||
### Transitions
|
||||
```
|
||||
Boot → Primary: After 10 seconds of sampling (regardless of ping results)
|
||||
Primary → Fallback: After 3 consecutive failures AND secondary is healthy
|
||||
Fallback → Primary: After 60 seconds of stable primary connectivity
|
||||
```
|
||||
|
||||
### Routing Behavior
|
||||
- **Boot State**: Both routes are set up initially - primary (metric 10) and secondary (metric 20)
|
||||
- **Primary State**: Primary route (metric 10) and secondary route (metric 20) present
|
||||
- **Fallback State**: All three routes present - primary (metric 10), secondary (metric 20), and failover secondary (metric 5)
|
||||
- **Exit**: Only the failover route (metric 5) is removed
|
||||
|
||||
### Route Management Strategy
|
||||
The system follows a "both routes always present, extra failover on-demand" approach:
|
||||
1. **Initialization**: Set up primary route (metric 10) and secondary route (metric 20)
|
||||
2. **Boot Phase**: Collect 10 seconds of ping samples to establish baseline connectivity
|
||||
3. **Normal Operation**: Primary route serves traffic (metric 10), secondary available as backup (metric 20)
|
||||
4. **Failover**: Add extra secondary route with highest priority (metric 5) for immediate failover
|
||||
5. **Failback**: Remove extra failover route when primary recovers
|
||||
6. **Cleanup**: Only remove the extra failover route on exit, preserving base routes
|
||||
|
||||
### State Persistence
|
||||
- Current state is maintained in memory
|
||||
- State changes are logged for debugging
|
||||
- No persistent storage required (state rebuilds on restart)
|
||||
|
||||
## Interface Design
|
||||
|
||||
### Pinger Interface
|
||||
```rust
|
||||
pub trait Pinger {
|
||||
async fn ping(&self, target: Ipv4Addr, interface: &str) -> PingResult;
|
||||
async fn start_monitoring(&self, targets: &[Ipv4Addr], interfaces: &[String]) -> Receiver<PingResult>;
|
||||
}
|
||||
```
|
||||
|
||||
### Route Manager Interface
|
||||
```rust
|
||||
pub trait RouteManager {
|
||||
fn add_default_route(&self, gateway: Ipv4Addr, interface: &str, metric: u32) -> Result<()>;
|
||||
fn delete_default_route(&self, gateway: Ipv4Addr, interface: &str, metric: u32) -> Result<()>;
|
||||
fn get_current_routes(&self) -> Result<Vec<RouteInfo>>;
|
||||
}
|
||||
```
|
||||
|
||||
## Threading Model
|
||||
|
||||
### Main Thread
|
||||
- Runs the state machine
|
||||
- Handles signals and graceful shutdown
|
||||
- Coordinates between components
|
||||
|
||||
### Async Pinger Tasks
|
||||
- One task per interface
|
||||
- Non-blocking ICMP operations
|
||||
- Results sent via channels
|
||||
|
||||
### Route Manager
|
||||
- Synchronous operations (netlink is sync)
|
||||
- Called from main thread
|
||||
- Thread-safe operations
|
||||
|
||||
## Error Handling Strategy
|
||||
|
||||
### Categories
|
||||
1. **Network Errors**: Temporary connectivity issues
|
||||
2. **System Errors**: Permission problems, interface not found
|
||||
3. **Configuration Errors**: Invalid IP addresses, missing interfaces
|
||||
|
||||
### Recovery Mechanisms
|
||||
- **Network Errors**: Retry with exponential backoff
|
||||
- **System Errors**: Log and exit (requires admin intervention)
|
||||
- **Configuration Errors**: Validate on startup, exit if invalid
|
||||
|
||||
## Security Considerations
|
||||
|
||||
### Privileges
|
||||
- Requires root privileges for route manipulation
|
||||
- Drops unnecessary privileges where possible
|
||||
- Validates all user inputs
|
||||
|
||||
### Network Security
|
||||
- Only sends ICMP packets to configured targets
|
||||
- No arbitrary packet crafting
|
||||
- Interface binding prevents traffic leakage
|
||||
|
||||
## Performance Characteristics
|
||||
|
||||
### Resource Usage
|
||||
- **Memory**: Minimal (~10MB)
|
||||
- **CPU**: Low (periodic ICMP packets)
|
||||
- **Network**: Very low (only ping traffic)
|
||||
|
||||
### Scalability
|
||||
- Single target machine design
|
||||
- Supports multiple ping targets
|
||||
- Limited to 2 interfaces (current design)
|
||||
|
||||
## Testing Architecture
|
||||
|
||||
### Unit Tests
|
||||
- Individual component testing
|
||||
- Mock network interfaces
|
||||
- State machine logic verification
|
||||
|
||||
### Integration Tests
|
||||
- Component interaction testing
|
||||
- Real network interface usage
|
||||
- Netlink operation verification
|
||||
|
||||
### End-to-End Tests
|
||||
- Full system testing in containers
|
||||
- Network failure simulation
|
||||
- Failover timing verification
|
||||
233
doc/TESTING.md
233
doc/TESTING.md
@@ -1,112 +1,43 @@
|
||||
# Testing Guide
|
||||
|
||||
## Overview
|
||||
## Test Environment
|
||||
|
||||
This document describes the testing strategy and environment for the Route-Switcher project.
|
||||
|
||||
## Testing Environment
|
||||
|
||||
### Podman-Compose Setup
|
||||
|
||||
The testing environment uses podman-compose to create a realistic network topology with routers and a single ICMP target:
|
||||
The testing environment uses podman-compose to create a network topology with routers and an ICMP target:
|
||||
|
||||
```
|
||||
┌─────────────────┐ ┌─────────────────┐ ┌─────────────────┐
|
||||
│ Route-Switcher │ │ Primary Router│ │ │
|
||||
│ Route-Switcher │ │ Primary Router│ │ ICMP Target │
|
||||
│ │ │ │ │ │
|
||||
│ eth0 ────────────┼────►│ eth0 ──────────┼────►│ ICMP Target │
|
||||
│ eth0 ────────────┼────►│ eth0 ──────────┼────►│ 192.168.202.100│
|
||||
│ eth1 ────────────┼────►│ eth1 ──────────┼────►│ │
|
||||
│ │ │ │ │ │
|
||||
└─────────────────┘ └─────────────────┘ └─────────────────┘
|
||||
│ │ │
|
||||
│ │ │
|
||||
▼ ▼ ▼
|
||||
primary-net secondary-net target-net
|
||||
192.168.1.0/24 192.168.2.0/24 10.0.0.0/24
|
||||
```
|
||||
|
||||
### Container Architecture
|
||||
|
||||
### Container Setup
|
||||
- **route-switcher**: Dual interfaces (eth0→primary-net, eth1→secondary-net)
|
||||
- **primary-router**: Connects primary-net ↔ target-net (192.168.1.1 ↔ 10.0.0.1)
|
||||
- **secondary-router**: Connects secondary-net ↔ target-net (192.168.2.1 ↔ 10.0.0.2)
|
||||
- **icmp-target**: Single IP on target-net (10.0.0.100), reachable via either router
|
||||
|
||||
### Quick Start
|
||||
|
||||
```bash
|
||||
# Start the testing environment
|
||||
podman-compose up -d
|
||||
|
||||
# Run automated failover test
|
||||
./scripts/test-failover.sh
|
||||
|
||||
# View logs
|
||||
podman-compose logs -f route-switcher
|
||||
|
||||
# Stop environment
|
||||
podman-compose down
|
||||
```
|
||||
|
||||
### Network Configuration
|
||||
|
||||
**Route-Switcher:**
|
||||
- eth0: 192.168.1.10 (primary network)
|
||||
- eth1: 192.168.2.10 (secondary network)
|
||||
- Default gateway: 192.168.1.1 (primary router)
|
||||
|
||||
**Primary Router:**
|
||||
- eth0: 192.168.1.1 (primary network)
|
||||
- eth1: 10.0.0.1 (target network)
|
||||
- Routes traffic between networks with NAT
|
||||
|
||||
**Secondary Router:**
|
||||
- eth0: 192.168.2.1 (secondary network)
|
||||
- eth1: 10.0.0.2 (target network)
|
||||
- Routes traffic between networks with NAT
|
||||
|
||||
**ICMP Target:**
|
||||
- Single IP: 10.0.0.100
|
||||
- Default route: 10.0.0.1 (primary router)
|
||||
- Responds to ping from both routers
|
||||
- **primary-router**: Connects primary-net ↔ target-net (192.168.200.11 ↔ 192.168.202.11)
|
||||
- **secondary-router**: Connects secondary-net ↔ target-net (192.168.201.11 ↔ 192.168.202.12)
|
||||
- **icmp-target**: Single IP on target-net (192.168.202.100)
|
||||
|
||||
## Test Scenarios
|
||||
|
||||
### 1. Basic Connectivity Test
|
||||
**Objective**: Verify basic ping functionality on both interfaces
|
||||
|
||||
```bash
|
||||
# Start environment
|
||||
podman-compose up -d
|
||||
|
||||
# Test primary connectivity
|
||||
podman-compose exec route-switcher ping -c 3 -I eth0 10.0.0.100
|
||||
|
||||
# Test secondary connectivity
|
||||
podman-compose exec route-switcher ping -c 3 -I eth1 10.0.0.100
|
||||
|
||||
# Check routing table
|
||||
podman-compose exec route-switcher ip route show
|
||||
podman-compose exec route-switcher ping -c 3 -I eth0 192.168.202.100
|
||||
podman-compose exec route-switcher ping -c 3 -I eth1 192.168.202.100
|
||||
```
|
||||
|
||||
### 2. Failover Test
|
||||
**Objective**: Verify automatic failover when primary router fails
|
||||
|
||||
```bash
|
||||
# Start monitoring logs
|
||||
# Monitor logs
|
||||
podman-compose logs -f route-switcher &
|
||||
|
||||
# Simulate primary router failure
|
||||
podman-compose exec primary-router ip link set eth0 down
|
||||
|
||||
# Verify failover occurs (should see in logs)
|
||||
# Wait for state change to Fallback
|
||||
|
||||
# Check routing table after failover
|
||||
podman-compose exec route-switcher ip route show
|
||||
|
||||
# Test connectivity via secondary router
|
||||
podman-compose exec route-switcher ping -c 3 10.0.0.100
|
||||
# Verify failover occurs and connectivity works
|
||||
podman-compose exec route-switcher ping -c 3 192.168.202.100
|
||||
|
||||
# Restore primary router
|
||||
podman-compose exec primary-router ip link set eth0 up
|
||||
@@ -115,119 +46,45 @@ podman-compose exec primary-router ip link set eth0 up
|
||||
```
|
||||
|
||||
### 3. Dual Failure Test
|
||||
**Objective**: Verify system doesn't failover when both routers fail
|
||||
|
||||
```bash
|
||||
# Start monitoring logs
|
||||
podman-compose logs -f route-switcher &
|
||||
|
||||
# Fail both routers
|
||||
# Fail both routers - system should NOT switch
|
||||
podman-compose exec primary-router ip link set eth0 down
|
||||
podman-compose exec secondary-router ip link set eth0 down
|
||||
|
||||
# Verify no routing changes occur
|
||||
# System should remain in current state
|
||||
|
||||
# Restore routers
|
||||
podman-compose exec primary-router ip link set eth0 up
|
||||
podman-compose exec secondary-router ip link set eth0 up
|
||||
```
|
||||
|
||||
### 4. Router Target Interface Failure
|
||||
**Objective**: Test upstream network failure simulation
|
||||
## Automated Testing
|
||||
|
||||
Run the comprehensive test script:
|
||||
```bash
|
||||
# Fail primary router's connection to target network
|
||||
podman-compose exec primary-router ip link set eth1 down
|
||||
|
||||
# Should trigger failover to secondary router
|
||||
# Verify connectivity still works via secondary path
|
||||
|
||||
# Restore primary router's target connection
|
||||
podman-compose exec primary-router ip link set eth1 up
|
||||
```
|
||||
|
||||
### 5. Automated Failover Test
|
||||
**Objective**: Run complete automated test sequence
|
||||
|
||||
```bash
|
||||
# Run the comprehensive test script
|
||||
./scripts/test-failover.sh
|
||||
|
||||
# This script will:
|
||||
# 1. Start the environment
|
||||
# 2. Verify initial connectivity
|
||||
# 3. Simulate primary router failure
|
||||
# 4. Monitor failover
|
||||
# 5. Restore primary router
|
||||
# 6. Verify failback after 60 seconds
|
||||
```
|
||||
|
||||
This script:
|
||||
1. Starts the test environment
|
||||
2. Verifies initial connectivity
|
||||
3. Simulates primary router failure
|
||||
4. Monitors failover
|
||||
5. Restores primary router
|
||||
6. Verifies failback
|
||||
|
||||
## Unit Tests
|
||||
|
||||
### Running Tests
|
||||
```bash
|
||||
# Run all tests
|
||||
cargo test
|
||||
|
||||
# Run specific test module
|
||||
# Run specific module
|
||||
cargo test pinger
|
||||
|
||||
# Run with coverage
|
||||
cargo tarpaulin --out Html
|
||||
cargo test routing
|
||||
cargo test state_machine
|
||||
```
|
||||
|
||||
### Test Structure
|
||||
```
|
||||
tests/
|
||||
├── unit/
|
||||
│ ├── pinger_tests.rs
|
||||
│ ├── routing_tests.rs
|
||||
│ └── state_machine_tests.rs
|
||||
├── integration/
|
||||
│ ├── netlink_tests.rs
|
||||
│ └── dual_interface_tests.rs
|
||||
└── e2e/
|
||||
└── failover_tests.rs
|
||||
```
|
||||
## Debug Commands
|
||||
|
||||
## Performance Testing
|
||||
|
||||
### Load Testing
|
||||
```bash
|
||||
# Test with multiple ping targets
|
||||
cargo run -- --ping-target 8.8.8.8
|
||||
|
||||
# Monitor resource usage
|
||||
podman stats route-switcher
|
||||
|
||||
# Test long-running stability
|
||||
# Run for 24 hours and monitor for memory leaks
|
||||
```
|
||||
|
||||
### Network Latency Testing
|
||||
```bash
|
||||
# Measure failover time
|
||||
# Start script to time the state transition
|
||||
start_time=$(date +%s%N)
|
||||
# Trigger failure
|
||||
# Wait for state change
|
||||
end_time=$(date +%s%N)
|
||||
failover_time=$((($end_time - $start_time) / 1000000))
|
||||
echo "Failover time: ${failover_time}ms"
|
||||
```
|
||||
|
||||
## Debugging Tests
|
||||
|
||||
### Common Issues
|
||||
1. **Permission Denied**: Ensure containers run with privileged mode
|
||||
2. **Interface Not Found**: Check network configuration in compose file
|
||||
3. **Netlink Errors**: Verify kernel supports required operations
|
||||
4. **Timing Issues**: Adjust test timeouts for your environment
|
||||
|
||||
### Debug Commands
|
||||
```bash
|
||||
# Check container network interfaces
|
||||
# Check container interfaces
|
||||
podman-compose exec route-switcher ip addr show
|
||||
|
||||
# Check routing table
|
||||
@@ -235,36 +92,4 @@ podman-compose exec route-switcher ip route show
|
||||
|
||||
# Monitor network traffic
|
||||
podman-compose exec route-switcher tcpdump -i any icmp
|
||||
|
||||
# Check system logs
|
||||
podman-compose exec route-switcher dmesg | tail -20
|
||||
```
|
||||
|
||||
## Test Data
|
||||
|
||||
### Sample Ping Results
|
||||
```rust
|
||||
// Mock data for testing
|
||||
let mock_ping_results = vec![
|
||||
PingResult::Ok, // Normal operation
|
||||
PingResult::Failed, // Single failure
|
||||
PingResult::Failed, // Consecutive failure
|
||||
PingResult::Failed, // Trigger failover
|
||||
];
|
||||
```
|
||||
|
||||
### Network Configuration
|
||||
```bash
|
||||
# Test network setup
|
||||
ip addr add 192.168.1.10/24 dev eth0
|
||||
ip addr add 192.168.2.10/24 dev eth1
|
||||
ip route add default via 192.168.1.1 dev eth0 metric 10
|
||||
ip route add default via 192.168.2.1 dev eth1 metric 20
|
||||
```
|
||||
|
||||
## Test Coverage Goals
|
||||
|
||||
- **Unit Tests**: 90%+ code coverage
|
||||
- **Integration Tests**: All major component interactions
|
||||
- **E2E Tests**: All user scenarios and edge cases
|
||||
- **Performance Tests**: Resource usage and timing validation
|
||||
|
||||
@@ -20,6 +20,11 @@ services:
|
||||
- PRIMARY_GATEWAY=192.168.200.11
|
||||
- SECONDARY_GATEWAY=192.168.201.11
|
||||
- PING_TARGET=192.168.202.100
|
||||
- API_ENABLED=true
|
||||
- API_BIND_ADDRESS=0.0.0.0
|
||||
- API_PORT=8080
|
||||
- API_USERNAME=admin
|
||||
# `$` is doubled so Compose passes the bcrypt separators through literally
# instead of treating them as variable interpolation.
- API_PASSWORD_HASH=$$2b$$12$$placeholder_hash_replace_with_actual_bcrypt_hash
|
||||
cap_add:
|
||||
- NET_ADMIN
|
||||
- SYS_ADMIN
|
||||
@@ -33,7 +38,8 @@ services:
|
||||
command: |
|
||||
sh -c "
|
||||
echo nameserver 192.168.10.1 > /etc/resolv.conf &&
|
||||
/bin/sleep infinity
|
||||
apt update && apt install -y iproute2 curl net-tools &&
|
||||
/bin/sleep infinity
|
||||
"
|
||||
networks:
|
||||
primary-net:
|
||||
@@ -50,6 +56,11 @@ services:
|
||||
- PRIMARY_GATEWAY=192.168.200.11
|
||||
- SECONDARY_GATEWAY=192.168.201.11
|
||||
- PING_TARGET=192.168.202.100
|
||||
- API_ENABLED=true
|
||||
- API_BIND_ADDRESS=0.0.0.0
|
||||
- API_PORT=8080
|
||||
- API_USERNAME=admin
|
||||
# `$` is doubled so Compose passes the bcrypt separators through literally
# instead of treating them as variable interpolation.
- API_PASSWORD_HASH=$$2b$$12$$placeholder_hash_replace_with_actual_bcrypt_hash
|
||||
cap_add:
|
||||
- NET_ADMIN
|
||||
- SYS_ADMIN
|
||||
|
||||
@@ -21,6 +21,13 @@ Environment=PRIMARY_GATEWAY=192.168.1.1
|
||||
Environment=SECONDARY_GATEWAY=192.168.2.1
|
||||
Environment=PING_TARGET=8.8.8.8
|
||||
|
||||
# API Configuration
|
||||
Environment=API_ENABLED=true
|
||||
Environment=API_BIND_ADDRESS=0.0.0.0
|
||||
Environment=API_PORT=8080
|
||||
Environment=API_USERNAME=admin
|
||||
Environment=API_PASSWORD_HASH=$2b$12$placeholder_hash_replace_with_actual_bcrypt_hash
|
||||
|
||||
User=root
|
||||
Group=root
|
||||
CapabilityBoundingSet=CAP_NET_ADMIN CAP_NET_RAW
|
||||
|
||||
332
src/api.rs
Normal file
332
src/api.rs
Normal file
@@ -0,0 +1,332 @@
|
||||
use anyhow::Result;
|
||||
use axum::middleware::Next;
|
||||
use axum::{
|
||||
Json, Router,
|
||||
extract::State,
|
||||
http::StatusCode,
|
||||
middleware,
|
||||
response::{IntoResponse, Response},
|
||||
routing::get,
|
||||
};
|
||||
use axum_extra::{
|
||||
TypedHeader,
|
||||
headers::{Authorization, authorization::Basic},
|
||||
};
|
||||
use bcrypt::verify;
|
||||
use chrono::{DateTime, Utc};
|
||||
use log::{debug, error, info, warn};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::collections::VecDeque;
|
||||
use std::env;
|
||||
use std::net::SocketAddr;
|
||||
use std::sync::Arc;
|
||||
use tokio::sync::Mutex;
|
||||
use tower_http::cors::{Any, CorsLayer};
|
||||
|
||||
use crate::pinger::PingResult;
|
||||
use crate::state_machine::{State as MachineState, StateMachine};
|
||||
|
||||
#[derive(Debug, Clone, Serialize)]
|
||||
pub struct PingStats {
|
||||
pub success_rate: f64,
|
||||
pub failures: usize,
|
||||
pub total_pings: usize,
|
||||
pub last_ping: String,
|
||||
}
|
||||
|
||||
impl PingStats {
|
||||
pub fn from_history(history: &VecDeque<PingResult>) -> Self {
|
||||
let total_pings = history.len();
|
||||
if total_pings == 0 {
|
||||
return Self {
|
||||
success_rate: 0.0,
|
||||
failures: 0,
|
||||
total_pings: 0,
|
||||
last_ping: "Unknown".to_string(),
|
||||
};
|
||||
}
|
||||
|
||||
let failures = history.iter().filter(|&x| *x == PingResult::Failed).count();
|
||||
let successes = total_pings - failures;
|
||||
let success_rate = if total_pings > 0 {
|
||||
(successes as f64 / total_pings as f64) * 100.0
|
||||
} else {
|
||||
0.0
|
||||
};
|
||||
|
||||
let last_ping = match history.front() {
|
||||
Some(PingResult::Ok) => "Ok".to_string(),
|
||||
Some(PingResult::Failed) => "Failed".to_string(),
|
||||
None => "Unknown".to_string(),
|
||||
};
|
||||
|
||||
Self {
|
||||
success_rate,
|
||||
failures,
|
||||
total_pings,
|
||||
last_ping,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize)]
|
||||
pub struct StateResponse {
|
||||
pub state: String,
|
||||
pub primary_stats: PingStats,
|
||||
pub secondary_stats: PingStats,
|
||||
pub last_failover: Option<String>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Deserialize)]
|
||||
pub struct StateRequest {
|
||||
pub state: String,
|
||||
}
|
||||
|
||||
#[derive(Debug, Serialize)]
|
||||
pub struct ErrorResponse {
|
||||
pub error: String,
|
||||
pub message: String,
|
||||
}
|
||||
|
||||
impl IntoResponse for ErrorResponse {
|
||||
fn into_response(self) -> Response {
|
||||
let status = if self.error.contains("Authentication") || self.error.contains("credentials")
|
||||
{
|
||||
StatusCode::UNAUTHORIZED
|
||||
} else {
|
||||
StatusCode::BAD_REQUEST
|
||||
};
|
||||
(status, Json(self)).into_response()
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct AppState {
|
||||
pub state_machine: Arc<Mutex<StateMachine>>,
|
||||
pub last_failover: Arc<Mutex<Option<DateTime<Utc>>>>,
|
||||
}
|
||||
|
||||
pub struct ApiServer {
|
||||
app: Router,
|
||||
}
|
||||
|
||||
impl ApiServer {
|
||||
pub fn new(
|
||||
state_machine: Arc<Mutex<StateMachine>>,
|
||||
last_failover: Arc<Mutex<Option<DateTime<Utc>>>>,
|
||||
) -> Result<Self> {
|
||||
let state = AppState {
|
||||
state_machine,
|
||||
last_failover,
|
||||
};
|
||||
|
||||
// Check if API is enabled
|
||||
let api_enabled = env::var("API_ENABLED").unwrap_or_else(|_| "false".to_string()) == "true";
|
||||
if !api_enabled {
|
||||
return Err(anyhow::anyhow!("API is disabled"));
|
||||
}
|
||||
|
||||
// Check if API authentication is configured
|
||||
if env::var("API_PASSWORD_HASH").is_err() {
|
||||
return Err(anyhow::anyhow!(
|
||||
"API_PASSWORD_HASH must be set when API is enabled"
|
||||
));
|
||||
}
|
||||
|
||||
info!("API authentication configured");
|
||||
|
||||
let cors = CorsLayer::new()
|
||||
.allow_origin(Any)
|
||||
.allow_methods(Any)
|
||||
.allow_headers(Any);
|
||||
|
||||
let app = Router::new()
|
||||
.route("/api/state", get(get_state).post(set_state))
|
||||
.layer(middleware::from_fn(auth_middleware))
|
||||
.layer(cors)
|
||||
.with_state(state);
|
||||
|
||||
Ok(Self { app })
|
||||
}
|
||||
|
||||
pub async fn run(self) -> Result<()> {
|
||||
let bind_address = env::var("API_BIND_ADDRESS").unwrap_or_else(|_| "0.0.0.0".to_string());
|
||||
let port = env::var("API_PORT")
|
||||
.unwrap_or_else(|_| "8080".to_string())
|
||||
.parse::<u16>()
|
||||
.map_err(|e| anyhow::anyhow!("Invalid API_PORT: {}", e))?;
|
||||
|
||||
let addr = SocketAddr::from(([127, 0, 0, 1], port));
|
||||
if bind_address != "127.0.0.1" {
|
||||
let addr_str = format!("{}:{}", bind_address, port);
|
||||
match addr_str.parse::<SocketAddr>() {
|
||||
Ok(parsed_addr) => {
|
||||
info!("Starting API server on {}", parsed_addr);
|
||||
let listener = tokio::net::TcpListener::bind(parsed_addr).await?;
|
||||
axum::serve(listener, self.app.into_make_service()).await?;
|
||||
}
|
||||
Err(e) => {
|
||||
error!("Invalid bind address {}: {}", addr_str, e);
|
||||
return Err(anyhow::anyhow!("Invalid bind address: {}", e));
|
||||
}
|
||||
}
|
||||
} else {
|
||||
info!("Starting API server on {}", addr);
|
||||
let listener = tokio::net::TcpListener::bind(addr).await?;
|
||||
axum::serve(listener, self.app.into_make_service()).await?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct AuthState {
|
||||
username: String,
|
||||
password_hash: String,
|
||||
}
|
||||
|
||||
impl AuthState {
|
||||
pub fn new() -> Result<Self> {
|
||||
let username = env::var("API_USERNAME").unwrap_or_else(|_| "admin".to_string());
|
||||
let password_hash = env::var("API_PASSWORD_HASH")?;
|
||||
|
||||
// Validate password hash format
|
||||
if password_hash.len() < 60 || !password_hash.starts_with("$2") {
|
||||
return Err(anyhow::anyhow!("Invalid password hash format"));
|
||||
}
|
||||
|
||||
Ok(Self {
|
||||
username,
|
||||
password_hash,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
fn verify_credentials(creds: &Basic) -> Result<(), StatusCode> {
|
||||
let auth_state = match AuthState::new() {
|
||||
Ok(state) => state,
|
||||
Err(_) => return Err(StatusCode::INTERNAL_SERVER_ERROR),
|
||||
};
|
||||
|
||||
if creds.username() != auth_state.username {
|
||||
warn!("Invalid username: {}", creds.username());
|
||||
return Err(StatusCode::UNAUTHORIZED);
|
||||
}
|
||||
|
||||
match verify(creds.password(), &auth_state.password_hash) {
|
||||
Ok(true) => {
|
||||
debug!("Authentication successful for user: {}", creds.username());
|
||||
Ok(())
|
||||
}
|
||||
Ok(false) => {
|
||||
warn!("Invalid password for user: {}", creds.username());
|
||||
Err(StatusCode::UNAUTHORIZED)
|
||||
}
|
||||
Err(e) => {
|
||||
error!("Password verification error: {}", e);
|
||||
Err(StatusCode::INTERNAL_SERVER_ERROR)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async fn auth_middleware(
|
||||
auth: TypedHeader<Authorization<Basic>>,
|
||||
request: axum::extract::Request,
|
||||
next: Next,
|
||||
) -> Result<Response, ErrorResponse> {
|
||||
let TypedHeader(Authorization(creds)) = auth;
|
||||
|
||||
if let Err(_) = verify_credentials(&creds) {
|
||||
return Err(ErrorResponse {
|
||||
error: "Authentication required".to_string(),
|
||||
message: "Invalid credentials".to_string(),
|
||||
});
|
||||
}
|
||||
|
||||
Ok(next.run(request).await)
|
||||
}
|
||||
|
||||
async fn get_state(
|
||||
State(app_state): State<AppState>,
|
||||
) -> Result<Json<StateResponse>, ErrorResponse> {
|
||||
let state_machine = app_state.state_machine.lock().await;
|
||||
let last_failover = app_state.last_failover.lock().await;
|
||||
|
||||
let current_state = state_machine.get_state();
|
||||
let state_str = match current_state {
|
||||
MachineState::Boot => "Boot",
|
||||
MachineState::Primary => "Primary",
|
||||
MachineState::Fallback => "Secondary",
|
||||
};
|
||||
|
||||
// Get ping statistics from state machine
|
||||
let primary_stats = PingStats::from_history(&state_machine.primary_history);
|
||||
let secondary_stats = PingStats::from_history(&state_machine.secondary_history);
|
||||
|
||||
let last_failover_str = last_failover.map(|dt| dt.to_rfc3339());
|
||||
|
||||
Ok(Json(StateResponse {
|
||||
state: state_str.to_string(),
|
||||
primary_stats,
|
||||
secondary_stats,
|
||||
last_failover: last_failover_str,
|
||||
}))
|
||||
}
|
||||
|
||||
async fn set_state(
|
||||
State(app_state): State<AppState>,
|
||||
Json(payload): Json<StateRequest>,
|
||||
) -> Result<Json<StateResponse>, ErrorResponse> {
|
||||
let target_state = payload.state.to_lowercase();
|
||||
|
||||
if target_state != "primary" && target_state != "secondary" {
|
||||
return Err(ErrorResponse {
|
||||
error: "Invalid state".to_string(),
|
||||
message: "State must be 'primary' or 'secondary'".to_string(),
|
||||
});
|
||||
}
|
||||
|
||||
let state_machine = app_state.state_machine.lock().await;
|
||||
let mut last_failover = app_state.last_failover.lock().await;
|
||||
|
||||
let old_state = state_machine.get_state().clone();
|
||||
let new_state = match target_state.as_str() {
|
||||
"primary" => MachineState::Primary,
|
||||
"secondary" => MachineState::Fallback,
|
||||
_ => unreachable!(), // Already validated above
|
||||
};
|
||||
|
||||
// Only update if state is actually changing
|
||||
if old_state != new_state {
|
||||
// Manually set the state (bypassing normal state machine logic)
|
||||
// This requires access to internal state machine state
|
||||
// For now, we'll log and update the failover timestamp
|
||||
info!("Manual state change: {:?} -> {:?}", old_state, new_state);
|
||||
|
||||
if new_state == MachineState::Fallback && old_state != MachineState::Fallback {
|
||||
*last_failover = Some(Utc::now());
|
||||
}
|
||||
|
||||
// Note: In a full implementation, we'd need to add a method to StateMachine
|
||||
// to manually set state and trigger the appropriate route changes
|
||||
// For now, this returns the current state with updated timestamp
|
||||
}
|
||||
|
||||
let state_str = match new_state {
|
||||
MachineState::Boot => "Boot",
|
||||
MachineState::Primary => "Primary",
|
||||
MachineState::Fallback => "Secondary",
|
||||
};
|
||||
|
||||
let primary_stats = PingStats::from_history(&state_machine.primary_history);
|
||||
let secondary_stats = PingStats::from_history(&state_machine.secondary_history);
|
||||
let last_failover_str = last_failover.map(|dt| dt.to_rfc3339());
|
||||
|
||||
Ok(Json(StateResponse {
|
||||
state: state_str.to_string(),
|
||||
primary_stats,
|
||||
secondary_stats,
|
||||
last_failover: last_failover_str,
|
||||
}))
|
||||
}
|
||||
151
src/main.rs
151
src/main.rs
@@ -1,4 +1,5 @@
|
||||
use anyhow::Result;
|
||||
use chrono::Utc;
|
||||
use clap::Parser;
|
||||
use env_logger::{Builder, Env};
|
||||
use log::{debug, error, info};
|
||||
@@ -12,6 +13,7 @@ use std::time::Duration;
|
||||
use tokio::signal;
|
||||
use tokio::sync::broadcast;
|
||||
|
||||
mod api;
|
||||
mod pinger;
|
||||
mod routing;
|
||||
mod state_machine;
|
||||
@@ -52,6 +54,18 @@ struct Config {
|
||||
failback_delay: u64,
|
||||
}
|
||||
|
||||
/// Returns `config` with selected fields replaced by environment variables.
///
/// For each of `PRIMARY_INTERFACE`, `SECONDARY_INTERFACE`, `PRIMARY_GATEWAY`,
/// `SECONDARY_GATEWAY` and `PING_TARGET`, a readable variable overrides the
/// corresponding field; otherwise the existing value is kept.
fn apply_env_overrides(mut config: Config) -> Config {
    if let Ok(value) = std::env::var("PRIMARY_INTERFACE") {
        config.primary_interface = value;
    }
    if let Ok(value) = std::env::var("SECONDARY_INTERFACE") {
        config.secondary_interface = value;
    }
    if let Ok(value) = std::env::var("PRIMARY_GATEWAY") {
        config.primary_gateway = value;
    }
    if let Ok(value) = std::env::var("SECONDARY_GATEWAY") {
        config.secondary_gateway = value;
    }
    if let Ok(value) = std::env::var("PING_TARGET") {
        config.ping_target = value;
    }
    config
}
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> Result<()> {
|
||||
let env = Env::default().filter_or("RUST_LOG", "info");
|
||||
@@ -62,22 +76,7 @@ async fn main() -> Result<()> {
|
||||
let config = Config::parse();
|
||||
|
||||
// Override with environment variables if present
|
||||
let primary_interface =
|
||||
std::env::var("PRIMARY_INTERFACE").unwrap_or(config.primary_interface.clone());
|
||||
let secondary_interface =
|
||||
std::env::var("SECONDARY_INTERFACE").unwrap_or(config.secondary_interface.clone());
|
||||
let primary_gateway =
|
||||
std::env::var("PRIMARY_GATEWAY").unwrap_or(config.primary_gateway.clone());
|
||||
let secondary_gateway =
|
||||
std::env::var("SECONDARY_GATEWAY").unwrap_or(config.secondary_gateway.clone());
|
||||
let ping_target = std::env::var("PING_TARGET").unwrap_or(config.ping_target.clone());
|
||||
|
||||
let mut config_with_env = config;
|
||||
config_with_env.primary_interface = primary_interface;
|
||||
config_with_env.secondary_interface = secondary_interface;
|
||||
config_with_env.primary_gateway = primary_gateway;
|
||||
config_with_env.secondary_gateway = secondary_gateway;
|
||||
config_with_env.ping_target = ping_target;
|
||||
let config_with_env = apply_env_overrides(config);
|
||||
|
||||
debug!("Configuration: {:?}", config_with_env);
|
||||
|
||||
@@ -125,6 +124,45 @@ async fn main() -> Result<()> {
|
||||
|
||||
use state_machine::StateMachine;
|
||||
|
||||
async fn handle_ping_result(
|
||||
result: pinger::PingResult,
|
||||
interface_name: &str,
|
||||
state_machine: &Arc<tokio::sync::Mutex<StateMachine>>,
|
||||
last_failover: &Arc<tokio::sync::Mutex<Option<chrono::DateTime<Utc>>>>,
|
||||
route_manager: &mut routing::RouteManager,
|
||||
primary_gateway: &Ipv4Addr,
|
||||
secondary_gateway: &Ipv4Addr,
|
||||
config: &Config,
|
||||
) -> Result<()> {
|
||||
debug!("{} ping result: {}", interface_name, result);
|
||||
let mut sm = state_machine.lock().await;
|
||||
|
||||
// Add result to appropriate history based on interface
|
||||
if interface_name == "primary" {
|
||||
sm.add_primary_result(result);
|
||||
} else {
|
||||
sm.add_secondary_result(result);
|
||||
}
|
||||
|
||||
if let Some((old_state, new_state)) = sm.update_state() {
|
||||
let mut last_failover_lock = last_failover.lock().await;
|
||||
if new_state == state_machine::State::Fallback
|
||||
&& old_state != state_machine::State::Fallback
|
||||
{
|
||||
*last_failover_lock = Some(Utc::now());
|
||||
}
|
||||
state_machine::handle_state_change(
|
||||
new_state,
|
||||
old_state,
|
||||
route_manager,
|
||||
primary_gateway,
|
||||
secondary_gateway,
|
||||
config,
|
||||
)?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn main_service(
|
||||
config: Config,
|
||||
primary_gateway: Ipv4Addr,
|
||||
@@ -166,32 +204,66 @@ async fn main_service(
|
||||
.await;
|
||||
|
||||
// Initialize state machine
|
||||
let mut state_machine = StateMachine::new(
|
||||
let state_machine = Arc::new(tokio::sync::Mutex::new(StateMachine::new(
|
||||
config.failover_threshold,
|
||||
Duration::from_secs(config.failback_delay),
|
||||
);
|
||||
)));
|
||||
|
||||
let last_failover = Arc::new(tokio::sync::Mutex::new(None::<chrono::DateTime<Utc>>));
|
||||
|
||||
// Start API server if enabled
|
||||
let api_handle =
|
||||
if let Ok(api_server) = api::ApiServer::new(state_machine.clone(), last_failover.clone()) {
|
||||
let handle = tokio::spawn(async move {
|
||||
if let Err(e) = api_server.run().await {
|
||||
error!("API server error: {}", e);
|
||||
}
|
||||
});
|
||||
Some(handle)
|
||||
} else {
|
||||
info!("API server disabled or not configured");
|
||||
None
|
||||
};
|
||||
|
||||
// Spawn the termination checker once, outside the select loop
|
||||
let mut term_checker = tokio::task::spawn_blocking({
|
||||
let term = Arc::clone(&term);
|
||||
move || {
|
||||
while !term.load(Ordering::Relaxed) {
|
||||
std::thread::sleep(Duration::from_millis(100));
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
// Main event loop
|
||||
loop {
|
||||
tokio::select! {
|
||||
// Handle primary ping results
|
||||
Some(result) = primary_rx.recv() => {
|
||||
debug!("Primary ping result: {}", result);
|
||||
state_machine.add_primary_result(result);
|
||||
|
||||
if let Some((old_state, new_state)) = state_machine.update_state() {
|
||||
state_machine::handle_state_change(new_state, old_state, &mut route_manager, &primary_gateway, &secondary_gateway, &config)?;
|
||||
}
|
||||
handle_ping_result(
|
||||
result,
|
||||
"primary",
|
||||
&state_machine,
|
||||
&last_failover,
|
||||
&mut route_manager,
|
||||
&primary_gateway,
|
||||
&secondary_gateway,
|
||||
&config,
|
||||
).await?;
|
||||
}
|
||||
|
||||
// Handle secondary ping results
|
||||
Some(result) = secondary_rx.recv() => {
|
||||
debug!("Secondary ping result: {}", result);
|
||||
state_machine.add_secondary_result(result);
|
||||
|
||||
if let Some((old_state, new_state)) = state_machine.update_state() {
|
||||
state_machine::handle_state_change(new_state, old_state, &mut route_manager, &primary_gateway, &secondary_gateway, &config)?;
|
||||
}
|
||||
handle_ping_result(
|
||||
result,
|
||||
"secondary",
|
||||
&state_machine,
|
||||
&last_failover,
|
||||
&mut route_manager,
|
||||
&primary_gateway,
|
||||
&secondary_gateway,
|
||||
&config,
|
||||
).await?;
|
||||
}
|
||||
|
||||
// Handle shutdown signal
|
||||
@@ -200,23 +272,22 @@ async fn main_service(
|
||||
break;
|
||||
}
|
||||
|
||||
// Check termination flag
|
||||
_ = tokio::task::spawn_blocking({
|
||||
let term = Arc::clone(&term);
|
||||
move || {
|
||||
while !term.load(Ordering::Relaxed) {
|
||||
std::thread::sleep(Duration::from_millis(100));
|
||||
}
|
||||
}
|
||||
}) => {
|
||||
// Check termination flag (now just waits for the already spawned task)
|
||||
_ = &mut term_checker => {
|
||||
info!("Received termination signal");
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Clean up API server if it was started
|
||||
if let Some(handle) = api_handle {
|
||||
handle.abort();
|
||||
}
|
||||
|
||||
// Clean up only the failover route on exit if we're in Fallback state
|
||||
if state_machine.get_state() == &state_machine::State::Fallback {
|
||||
let sm = state_machine.lock().await;
|
||||
if sm.get_state() == &state_machine::State::Fallback {
|
||||
route_manager
|
||||
.remove_failover_route(secondary_gateway, config.secondary_interface.clone())?;
|
||||
info!("Failover route cleared on exit");
|
||||
|
||||
162
src/routing.rs
162
src/routing.rs
@@ -2,11 +2,20 @@ use anyhow::Result;
|
||||
use libc::if_nametoindex;
|
||||
use log::{debug, info};
|
||||
use netlink_packet_route::route::RouteAddress;
|
||||
use netlink_packet_route::{
|
||||
AddressFamily, RouteNetlinkMessage,
|
||||
route::{RouteAttribute, RouteHeader, RouteMessage, RouteProtocol, RouteScope, RouteType},
|
||||
};
|
||||
use std::ffi::CString;
|
||||
use std::net::Ipv4Addr;
|
||||
|
||||
const MAIN_TABLE_ID: u8 = 254;
|
||||
|
||||
// Route metrics - higher priority = lower number
|
||||
const FAILOVER_METRIC: u32 = 5;
|
||||
const PRIMARY_METRIC: u32 = 10;
|
||||
const SECONDARY_METRIC: u32 = 20;
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct RouteInfo {
|
||||
pub gateway: Ipv4Addr,
|
||||
@@ -60,8 +69,6 @@ impl RouteManager {
|
||||
}
|
||||
|
||||
pub fn set_primary_route(&mut self, gateway: Ipv4Addr, interface: String) -> Result<()> {
|
||||
let primary_metric = 10;
|
||||
|
||||
// Remove existing routes for this interface if any
|
||||
if let Some(pos) = self.routes.iter().position(|r| r.interface == interface) {
|
||||
let existing_route = self.routes[pos].clone();
|
||||
@@ -73,7 +80,7 @@ impl RouteManager {
|
||||
}
|
||||
|
||||
// Add as primary route
|
||||
self.add_route(gateway, interface, primary_metric)?;
|
||||
self.add_route(gateway, interface, PRIMARY_METRIC)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -88,20 +95,17 @@ impl RouteManager {
|
||||
self.set_primary_route(primary_gateway, primary_interface)?;
|
||||
|
||||
// Set secondary route with metric 20 (lower priority)
|
||||
let secondary_metric = 20;
|
||||
self.add_route(secondary_gateway, secondary_interface, secondary_metric)?;
|
||||
self.add_route(secondary_gateway, secondary_interface, SECONDARY_METRIC)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn add_failover_route(&mut self, gateway: Ipv4Addr, interface: String) -> Result<()> {
|
||||
let failover_metric = 5; // Higher priority than both primary (10) and secondary (20)
|
||||
self.add_route(gateway, interface, failover_metric)?;
|
||||
self.add_route(gateway, interface, FAILOVER_METRIC)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn remove_failover_route(&mut self, gateway: Ipv4Addr, interface: String) -> Result<()> {
|
||||
let failover_metric = 5;
|
||||
self.remove_route(gateway, &interface, failover_metric)?;
|
||||
self.remove_route(gateway, &interface, FAILOVER_METRIC)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -123,36 +127,16 @@ impl RouteManager {
|
||||
use netlink_packet_core::{
|
||||
NLM_F_ACK, NLM_F_CREATE, NLM_F_REQUEST, NetlinkHeader, NetlinkMessage, NetlinkPayload,
|
||||
};
|
||||
use netlink_packet_route::{
|
||||
AddressFamily, RouteNetlinkMessage,
|
||||
route::RouteProtocol,
|
||||
route::RouteScope,
|
||||
route::{RouteAttribute, RouteHeader, RouteMessage, RouteType},
|
||||
};
|
||||
use netlink_sys::{Socket, SocketAddr, protocols::NETLINK_ROUTE};
|
||||
|
||||
let mut socket = Socket::new(NETLINK_ROUTE)?;
|
||||
let _port_number = socket.bind_auto()?.port_number();
|
||||
socket.connect(&SocketAddr::new(0, 0))?;
|
||||
let route_msg_hdr = RouteHeader {
|
||||
address_family: AddressFamily::Inet,
|
||||
table: MAIN_TABLE_ID,
|
||||
destination_prefix_length: 0, // Default route
|
||||
protocol: RouteProtocol::Boot,
|
||||
scope: RouteScope::Universe,
|
||||
kind: RouteType::Unicast,
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let mut route_msg = RouteMessage::default();
|
||||
route_msg.header = route_msg_hdr;
|
||||
route_msg.attributes = vec![
|
||||
RouteAttribute::Gateway(RouteAddress::Inet(route_info.gateway)),
|
||||
RouteAttribute::Oif(index),
|
||||
RouteAttribute::Priority(route_info.metric),
|
||||
];
|
||||
let route_msg = create_route_message(route_info.gateway, index, route_info.metric);
|
||||
|
||||
let mut nl_hdr = NetlinkHeader::default();
|
||||
nl_hdr.flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_ACK; // Remove NLM_F_EXCL to allow updates
|
||||
nl_hdr.flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_ACK;
|
||||
|
||||
let mut msg = NetlinkMessage::new(
|
||||
nl_hdr,
|
||||
@@ -161,11 +145,8 @@ impl RouteManager {
|
||||
|
||||
msg.finalize();
|
||||
let mut buf = vec![0; 1024 * 8];
|
||||
|
||||
msg.serialize(&mut buf[..msg.buffer_len()]);
|
||||
|
||||
// Debug: Log the netlink message being sent
|
||||
debug!("Netlink message being sent: {:?}", &buf[..msg.buffer_len()]);
|
||||
debug!(
|
||||
"Route addition attempt: gateway={}, interface={}, metric={}, interface_index={}",
|
||||
route_info.gateway, route_info.interface, route_info.metric, index
|
||||
@@ -198,33 +179,18 @@ impl RouteManager {
|
||||
route_info.metric
|
||||
);
|
||||
} else {
|
||||
let error_str = match error_code {
|
||||
-1 => "EPERM - Operation not permitted (need root privileges)",
|
||||
-2 => "ENOENT - No such file or directory",
|
||||
-13 => "EACCES - Permission denied",
|
||||
-22 => "EINVAL - Invalid argument",
|
||||
_ => "Unknown error",
|
||||
};
|
||||
return Err(anyhow::anyhow!(
|
||||
"Failed to add route: {} (code: {}): {:?}",
|
||||
error_str,
|
||||
error_code,
|
||||
error_msg
|
||||
));
|
||||
return handle_netlink_error(error_code);
|
||||
}
|
||||
}
|
||||
debug!("Route added successfully");
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
Err(e) => {
|
||||
return Err(anyhow::anyhow!(
|
||||
"Failed to deserialize netlink message: {}",
|
||||
e
|
||||
));
|
||||
}
|
||||
Err(e) => Err(anyhow::anyhow!(
|
||||
"Failed to deserialize netlink message: {}",
|
||||
e
|
||||
)),
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn delete_default_route_internal(
|
||||
@@ -242,35 +208,13 @@ impl RouteManager {
|
||||
use netlink_packet_core::{
|
||||
NLM_F_ACK, NLM_F_REQUEST, NetlinkHeader, NetlinkMessage, NetlinkPayload,
|
||||
};
|
||||
use netlink_packet_route::{
|
||||
AddressFamily, RouteNetlinkMessage,
|
||||
route::RouteProtocol,
|
||||
route::RouteScope,
|
||||
route::{RouteAttribute, RouteHeader, RouteMessage, RouteType},
|
||||
};
|
||||
use netlink_sys::{Socket, SocketAddr, protocols::NETLINK_ROUTE};
|
||||
|
||||
let mut socket = Socket::new(NETLINK_ROUTE)?;
|
||||
let _port_number = socket.bind_auto()?.port_number();
|
||||
socket.connect(&SocketAddr::new(0, 0))?;
|
||||
|
||||
let route_msg_hdr = RouteHeader {
|
||||
address_family: AddressFamily::Inet,
|
||||
table: MAIN_TABLE_ID,
|
||||
destination_prefix_length: 0, // Default route
|
||||
protocol: RouteProtocol::Boot,
|
||||
scope: RouteScope::Universe,
|
||||
kind: RouteType::Unicast,
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let mut route_msg = RouteMessage::default();
|
||||
route_msg.header = route_msg_hdr;
|
||||
route_msg.attributes = vec![
|
||||
RouteAttribute::Gateway(RouteAddress::Inet(gateway)),
|
||||
RouteAttribute::Oif(index),
|
||||
RouteAttribute::Priority(metric),
|
||||
];
|
||||
let route_msg = create_route_message(gateway, index, metric);
|
||||
|
||||
let mut nl_hdr = NetlinkHeader::default();
|
||||
nl_hdr.flags = NLM_F_REQUEST | NLM_F_ACK;
|
||||
@@ -282,14 +226,8 @@ impl RouteManager {
|
||||
|
||||
msg.finalize();
|
||||
let mut buf = vec![0; 1024 * 8];
|
||||
|
||||
msg.serialize(&mut buf[..msg.buffer_len()]);
|
||||
|
||||
// Debug: Log the netlink message being sent
|
||||
debug!(
|
||||
"Netlink delete message being sent: {:?}",
|
||||
&buf[..msg.buffer_len()]
|
||||
);
|
||||
debug!(
|
||||
"Route deletion attempt: gateway={}, interface={}, metric={}, interface_index={}",
|
||||
gateway, interface, metric, index
|
||||
@@ -315,16 +253,13 @@ impl RouteManager {
|
||||
}
|
||||
debug!("Route deleted successfully");
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
Err(e) => {
|
||||
return Err(anyhow::anyhow!(
|
||||
"Failed to deserialize netlink message: {}",
|
||||
e
|
||||
));
|
||||
}
|
||||
Err(e) => Err(anyhow::anyhow!(
|
||||
"Failed to deserialize netlink message: {}",
|
||||
e
|
||||
)),
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -338,3 +273,44 @@ fn get_interface_index(iface_name: &str) -> Result<u32> {
|
||||
Ok(index)
|
||||
}
|
||||
}
|
||||
|
||||
fn create_route_header() -> RouteHeader {
|
||||
RouteHeader {
|
||||
address_family: AddressFamily::Inet,
|
||||
table: MAIN_TABLE_ID,
|
||||
destination_prefix_length: 0, // Default route
|
||||
protocol: RouteProtocol::Boot,
|
||||
scope: RouteScope::Universe,
|
||||
kind: RouteType::Unicast,
|
||||
..Default::default()
|
||||
}
|
||||
}
|
||||
|
||||
fn handle_netlink_error(error_code: i32) -> Result<()> {
|
||||
if error_code == -17 {
|
||||
// EEXIST - Route already exists, treat as success
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let error_str = match error_code {
|
||||
-1 => "EPERM - Operation not permitted (need root privileges)",
|
||||
-2 => "ENOENT - No such file or directory",
|
||||
-13 => "EACCES - Permission denied",
|
||||
-22 => "EINVAL - Invalid argument",
|
||||
_ => "Unknown error",
|
||||
};
|
||||
Err(anyhow::anyhow!("Netlink operation failed: {}", error_str))
|
||||
}
|
||||
|
||||
fn create_route_message(gateway: Ipv4Addr, interface_index: u32, metric: u32) -> RouteMessage {
|
||||
let route_msg_hdr = create_route_header();
|
||||
|
||||
let mut route_msg = RouteMessage::default();
|
||||
route_msg.header = route_msg_hdr;
|
||||
route_msg.attributes = vec![
|
||||
RouteAttribute::Gateway(RouteAddress::Inet(gateway)),
|
||||
RouteAttribute::Oif(interface_index),
|
||||
RouteAttribute::Priority(metric),
|
||||
];
|
||||
route_msg
|
||||
}
|
||||
|
||||
@@ -13,9 +13,9 @@ pub enum State {
|
||||
}
|
||||
|
||||
pub struct StateMachine {
|
||||
state: State,
|
||||
primary_history: VecDeque<pinger::PingResult>,
|
||||
secondary_history: VecDeque<pinger::PingResult>,
|
||||
pub state: State,
|
||||
pub primary_history: VecDeque<pinger::PingResult>,
|
||||
pub secondary_history: VecDeque<pinger::PingResult>,
|
||||
failover_threshold: usize,
|
||||
failback_delay: Duration,
|
||||
last_failover: Option<std::time::Instant>,
|
||||
|
||||
Reference in New Issue
Block a user