#deep-learning #computer-vision #pytorch #machine-learning

torsh-vision

Computer vision utilities for ToRSh deep learning framework

1 unstable release

0.1.0-alpha.1 Sep 30, 2025

#35 in #pytorch

Download history: 117/week @ 2025-09-29, 23/week @ 2025-10-06, 20/week @ 2025-10-13, 13/week @ 2025-10-20, 3/week @ 2025-10-27

68 downloads per month
Used in 2 crates

MIT/Apache

7MB
155K SLoC

torsh-vision

Computer vision utilities and models for ToRSh, leveraging scirs2-vision for optimized image operations.

Overview

This crate provides comprehensive computer vision functionality:

  • Image I/O: Loading, saving, and format conversion
  • Transforms: Data augmentation and preprocessing
  • Datasets: Common vision datasets (ImageNet, COCO, etc.)
  • Models: Pre-trained vision models
  • Operations: Image processing and computer vision algorithms
  • Visualization: Image display and annotation utilities

Note: This crate integrates with scirs2-vision for optimized image processing operations.

Usage

Note: the examples below use illustrative keyword-argument syntax (e.g. `pretrained=true`, `fps=30`) for readability; Rust does not support keyword arguments, so consult the crate's API documentation for the actual function signatures.

Image I/O

use torsh_vision::prelude::*;

// Load images
let image = read_image("path/to/image.jpg")?;
let batch = read_image_batch(&["img1.jpg", "img2.jpg", "img3.jpg"])?;

// Save images
write_image(&tensor, "output.png")?;
write_video(&frames, "output.mp4", fps=30)?;

// Format conversion
let rgb = bgr_to_rgb(&bgr_image)?;
let gray = rgb_to_grayscale(&rgb_image)?;
let hsv = rgb_to_hsv(&rgb_image)?;

Transforms

use torsh_vision::transforms::*;

// Create transform pipeline
let transform = Compose::new(vec![
    Box::new(Resize::new(256)),
    Box::new(CenterCrop::new(224)),
    Box::new(ToTensor::new()),
    Box::new(Normalize::new(
        vec![0.485, 0.456, 0.406],  // ImageNet mean
        vec![0.229, 0.224, 0.225],  // ImageNet std
    )),
]);

let transformed = transform.apply(&image)?;

// Data augmentation
let augment = Compose::new(vec![
    Box::new(RandomResizedCrop::new(224, scale=(0.08, 1.0))),
    Box::new(RandomHorizontalFlip::new(0.5)),
    Box::new(ColorJitter::new(0.4, 0.4, 0.4, 0.1)),
    Box::new(RandomErasing::new(0.2)),
    Box::new(ToTensor::new()),
    Box::new(Normalize::imagenet()),
]);

// Advanced augmentations
let mixup = MixUp::new(alpha=1.0);
let cutmix = CutMix::new(alpha=1.0);
let augmix = AugMix::new(severity=3, width=3, depth=-1, alpha=1.0);

Datasets

use torsh_vision::datasets::*;

// ImageNet dataset
let imagenet = ImageNet::new(
    root="./data/imagenet",
    split="train",
    transform=Some(transform),
    download=false,
)?;

// COCO dataset
let coco = COCODetection::new(
    root="./data/coco",
    split="train2017",
    transform=Some(transform),
    target_transform=None,
)?;

// CIFAR datasets
let cifar10 = CIFAR10::new(
    root="./data",
    train=true,
    transform=Some(transform),
    download=true,
)?;

// Custom folder dataset
let dataset = ImageFolder::new(
    root="./data/custom",
    transform=Some(transform),
    extensions=Some(vec!["jpg", "jpeg", "png"]),
)?;

// Video dataset
let video_dataset = VideoFolder::new(
    root="./data/videos",
    clip_len=16,
    frame_interval=1,
    num_clips=1,
    transform=Some(video_transform),
)?;

Pre-trained Models

use torsh_vision::models::*;

// Classification models
let resnet = resnet50(pretrained=true, num_classes=1000)?;
let efficientnet = efficientnet_b0(pretrained=true)?;
let vit = vit_base_patch16_224(pretrained=true)?;

// Object detection
let faster_rcnn = fasterrcnn_resnet50_fpn(
    pretrained=true,
    num_classes=91,
    pretrained_backbone=true,
)?;

// Segmentation
let deeplabv3 = deeplabv3_resnet101(
    pretrained=true,
    num_classes=21,
    aux_loss=true,
)?;

// Feature extraction
let features = resnet.features(&input)?;
let backbone = create_feature_extractor(
    &resnet,
    return_nodes=vec!["layer1", "layer2", "layer3", "layer4"],
)?;

Image Operations

use torsh_vision::ops::*;

// Basic operations (leveraging scirs2-vision)
let resized = resize(&image, size=[224, 224], interpolation="bilinear")?;
let cropped = crop(&image, top=10, left=10, height=200, width=200)?;
let flipped = hflip(&image)?;
let rotated = rotate(&image, angle=45.0, fill=vec![0, 0, 0])?;

// Filtering
let blurred = gaussian_blur(&image, kernel_size=[5, 5], sigma=[1.0, 1.0])?;
let sharpened = adjust_sharpness(&image, sharpness_factor=2.0)?;
let edge = sobel_edge_detection(&image)?;

// Color adjustments
let bright = adjust_brightness(&image, brightness_factor=1.5)?;
let contrast = adjust_contrast(&image, contrast_factor=1.5)?;
let saturated = adjust_saturation(&image, saturation_factor=1.5)?;

// Advanced operations
let slic = slic_superpixels(&image, n_segments=100, compactness=10.0)?;
let optical_flow = dense_optical_flow(&frame1, &frame2, method="farneback")?;

Object Detection Utilities

use torsh_vision::utils::*;

// Bounding box operations
let iou = box_iou(&boxes1, &boxes2)?;
let nms_keep = nms(&boxes, &scores, iou_threshold=0.5)?;
let converted = box_convert(&boxes, in_fmt="xyxy", out_fmt="cxcywh")?;

// Anchor generation
let anchors = AnchorGenerator::new(
    sizes=vec![vec![32], vec![64], vec![128], vec![256], vec![512]],
    aspect_ratios=vec![vec![0.5, 1.0, 2.0]; 5],
)?;

// ROI operations
let roi_pool = roi_pool(&features, &boxes, output_size=[7, 7], spatial_scale=0.25)?;
let roi_align = roi_align(&features, &boxes, output_size=[7, 7], spatial_scale=0.25)?;

Visualization

use torsh_vision::utils::*;

// Draw bounding boxes
let annotated = draw_bounding_boxes(
    &image,
    &boxes,
    labels=Some(&labels),
    colors=None,
    width=2,
)?;

// Draw segmentation masks
let masked = draw_segmentation_masks(
    &image,
    &masks,
    alpha=0.7,
    colors=None,
)?;

// Draw keypoints
let keypoint_image = draw_keypoints(
    &image,
    &keypoints,
    connectivity=Some(&COCO_PERSON_SKELETON),
    colors=None,
    radius=3,
)?;

// Create image grid
let grid = make_grid(
    &tensor_list,
    nrow=8,
    padding=2,
    normalize=true,
    value_range=None,
)?;

// Save visualization
save_image(&grid, "visualization.png")?;

Video Processing

use torsh_vision::video::*;

// Read video
let video = read_video("input.mp4", start_pts=0, end_pts=None)?;
let frames = video.frames;  // Tensor of shape [T, C, H, W]
let audio = video.audio;    // Optional audio tensor

// Write video
write_video(
    "output.mp4",
    &frames,
    fps=30.0,
    video_codec="h264",
    audio=audio,
    audio_codec="aac",
)?;

// Video transforms
let video_transform = VideoCompose::new(vec![
    Box::new(VideoResize::new(256)),
    Box::new(VideoCenterCrop::new(224)),
    Box::new(VideoNormalize::imagenet()),
]);

Feature Extraction and Similarity

// Extract features
let feature_extractor = create_feature_extractor(
    &model,
    return_nodes=vec!["avgpool"],
)?;
let features = feature_extractor(&images)?;

// Image similarity
let similarity = cosine_similarity(&features1, &features2)?;

// Image retrieval
let retrieval_system = ImageRetrieval::new(feature_extractor);
retrieval_system.add_images(&database_images)?;
let similar_images = retrieval_system.search(&query_image, top_k=10)?;

Integration with SciRS2

This crate leverages scirs2-vision for:

  • Optimized image processing operations
  • Efficient data augmentation
  • Hardware-accelerated transforms
  • Computer vision algorithms

License

Licensed under either of

  • Apache License, Version 2.0
  • MIT license

at your option.

Dependencies

~168MB
~3M SLoC