kujirahand/sort_simple.rs

kujirahand · 2026-02-03T03:54:48Z

高速化してみたもの：

use std::fs::File;
use std::error::Error;
use std::io::{Read, Write};

/// Prints the first five records of `utf_ken_all.csv` when ordered by the kana
/// columns (column 5, then column 6), as `<postal> <addr1><addr2>` lines.
///
/// The file is read into memory once and every field is handled as a borrowed
/// slice of that buffer. Fields are split on raw commas; quoted fields that
/// themselves contain commas are not supported.
///
/// # Errors
///
/// Returns any I/O error raised while opening or reading the input file, or
/// while writing to / flushing standard output.
fn main() -> Result<(), Box<dyn Error>> {
    // Slurp the whole file; the parsed fields below borrow from this buffer.
    let mut data = Vec::new();
    File::open("utf_ken_all.csv")?.read_to_end(&mut data)?;
    // Make sure the final record is newline-terminated like all the others,
    // so the scanning loop does not need a special case for it.
    if !data.ends_with(b"\n") {
        data.push(b'\n');
    }

    // Pre-size the vector with a rough row-count estimate to limit regrowth.
    let estimated_rows = 150_000;
    let mut entries: Vec<Entry<'_>> = Vec::with_capacity(estimated_rows);

    // Single pass over the bytes: each ',' or '\n' closes the current field.
    // Only the first nine columns are retained; later ones are just counted.
    let mut start = 0;
    let mut col = 0;
    let mut fields: [&str; 9] = [""; 9];
    for (i, &b) in data.iter().enumerate() {
        if b != b',' && b != b'\n' {
            continue;
        }

        if let Some(slot) = fields.get_mut(col) {
            *slot = trim_quotes(&data[start..i]);
        }
        col += 1;
        start = i + 1;

        if b == b'\n' {
            // Records with fewer than nine columns are dropped.
            if col >= fields.len() {
                entries.push(Entry {
                    kana1: fields[4],
                    kana2: fields[5],
                    postal: fields[2],
                    addr1: fields[7],
                    addr2: fields[8],
                });
            }
            // Reset the per-record state for the next line.
            col = 0;
            fields = [""; 9];
        }
    }

    let k = entries.len().min(5);
    if k == 0 {
        return Ok(());
    }

    // Order by the first kana column, falling back to the second on ties.
    let by_kana = |a: &Entry<'_>, b: &Entry<'_>| {
        a.kana1.cmp(b.kana1).then_with(|| a.kana2.cmp(b.kana2))
    };

    // Partition so that the k smallest entries occupy the front of the vector,
    // then sort just that prefix. Both steps are unstable: records whose kana
    // columns compare equal may come out in any relative order.
    entries.select_nth_unstable_by(k - 1, by_kana);
    entries[..k].sort_unstable_by(by_kana);

    let mut out = std::io::BufWriter::new(std::io::stdout());
    for entry in &entries[..k] {
        writeln!(out, "{} {}{}", entry.postal, entry.addr1, entry.addr2)?;
    }
    // Flush explicitly: an error during the implicit flush on drop would be
    // silently discarded.
    out.flush()?;

    Ok(())
}

/// The columns of one CSV record that the program needs, held as borrowed
/// string slices (lifetime `'a`) rather than owned copies.
#[derive(Debug, Clone, Copy)]
struct Entry<'a> {
    /// CSV column 5 (index 4): primary sort key.
    kana1: &'a str,
    /// CSV column 6 (index 5): secondary sort key, used when `kana1` ties.
    kana2: &'a str,
    /// CSV column 3 (index 2): postal code, printed first on each output line.
    postal: &'a str,
    /// CSV column 8 (index 7): first part of the printed address.
    addr1: &'a str,
    /// CSV column 9 (index 8): second part of the printed address.
    addr2: &'a str,
}

/// Strips one pair of enclosing double quotes from `bytes`, if present, and
/// returns the remainder as a UTF-8 string slice.
///
/// Input that is not wrapped in a leading *and* trailing `"` (including a lone
/// `"`) is returned without trimming. Bytes that are not valid UTF-8 yield the
/// empty string instead of an error.
fn trim_quotes(bytes: &[u8]) -> &str {
    // The slice pattern only matches when there are at least two bytes and
    // both the first and the last one are a double quote.
    let unquoted = match bytes {
        [b'"', inner @ .., b'"'] => inner,
        _ => bytes,
    };
    match std::str::from_utf8(unquoted) {
        Ok(text) => text,
        Err(_) => "",
    }
}

kujirahand · 2026-02-03T07:35:19Z

メモリ使用量に配慮したプログラム

// メモリを抑えつつ、住所カナ(5+6列目)昇順の先頭5件だけを保持して出力
// ポイント: 全件を保持・ソートせず、ヒープで上位5件のみを管理する
use std::cmp::Ordering;
use std::collections::BinaryHeap;
use std::fs::File;
use std::io::{BufRead, BufReader, Write};

/// Streams `utf_ken_all.csv` line by line while holding at most a handful of
/// records in memory, then prints the survivors as `<postal> <address>` lines.
///
/// Each record is pushed into a `BinaryHeap<Entry>`; whenever the heap grows
/// past `TOP_N` items its greatest element is popped. Which records survive,
/// and the order they are printed in, is therefore decided entirely by
/// `Entry`'s `Ord` implementation.
///
/// Fields are split on raw commas; quoted fields that themselves contain
/// commas are not supported.
///
/// # Errors
///
/// Returns any I/O error raised while opening or reading the input file, or
/// while writing to / flushing standard output.
fn main() -> std::io::Result<()> {
    /// Maximum number of records kept (and printed).
    const TOP_N: usize = 5;

    // Buffered, line-at-a-time reading keeps memory use independent of file size.
    let reader = BufReader::new(File::open("utf_ken_all.csv")?);

    // One spare slot: the heap briefly holds TOP_N + 1 items before each pop.
    let mut heap: BinaryHeap<Entry> = BinaryHeap::with_capacity(TOP_N + 1);

    for line in reader.lines() {
        let line = line?;
        if line.is_empty() {
            continue;
        }

        // Split on commas and strip surrounding double quotes from each field.
        let fields: Vec<&str> = line
            .split(',')
            .map(|s| s.trim_matches('"'))
            .collect();

        // Skip rows that have fewer columns than the ones read below.
        if fields.len() < 9 {
            continue;
        }

        // Key = columns 5+6, output = column 3 plus columns 8+9.
        heap.push(Entry {
            kana_key: format!("{}{}", fields[4], fields[5]),
            postal: fields[2].to_string(),
            address: format!("{}{}", fields[7], fields[8]),
        });

        // Cap the heap size by evicting its current greatest element.
        if heap.len() > TOP_N {
            heap.pop();
        }
    }

    // `into_sorted_vec` yields the retained records in ascending `Ord` order.
    let mut stdout = std::io::BufWriter::new(std::io::stdout());
    for entry in heap.into_sorted_vec() {
        writeln!(stdout, "{} {}", entry.postal, entry.address)?;
    }
    // Flush explicitly: an error during the implicit flush on drop would be
    // silently discarded.
    stdout.flush()?;

    Ok(())
}

/// One output candidate: the sort key plus the two values that get printed.
///
/// Entries order naturally (ascending) by `kana_key`, then `postal`, then
/// `address`. Because `BinaryHeap` is a max-heap, popping from a heap of
/// `Entry` removes the entry that sorts *last*, so a size-capped heap retains
/// the entries that sort first and `into_sorted_vec` returns them ascending.
///
/// Equality is derived over all three fields, and `Ord` compares the same
/// three fields, so `a == b` holds exactly when `a.cmp(&b)` is `Equal`.
#[derive(Debug, PartialEq, Eq)]
struct Entry {
    /// Sort key: the concatenated kana columns.
    kana_key: String,
    /// Postal code.
    postal: String,
    /// Address text (columns 8 and 9 concatenated).
    address: String,
}

impl Ord for Entry {
    fn cmp(&self, other: &Self) -> Ordering {
        // Natural ascending order. Do NOT reverse this: combined with the
        // max-heap's "pop the greatest" behaviour, a reversed ordering would
        // evict the smallest keys and keep the largest ones instead.
        self.kana_key
            .cmp(&other.kana_key)
            .then_with(|| self.postal.cmp(&other.postal))
            .then_with(|| self.address.cmp(&other.address))
    }
}

impl PartialOrd for Entry {
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        Some(self.cmp(other))
    }
}

	use std::fs::File;
	use std::io::{BufRead, BufReader};
	use std::error::Error;

	/// Loads every record of `utf_ken_all.csv`, sorts them by the concatenated
	/// kana columns (5 and 6) and prints the first five as `<postal> <address>`.
	///
	/// Fields are split on raw commas; quoted fields that themselves contain
	/// commas are not supported. Columns missing from a short row are treated
	/// as empty strings rather than causing the row to be skipped.
	///
	/// # Errors
	///
	/// Returns any I/O error raised while opening or reading the input file.
	fn main() -> Result<(), Box<dyn Error>> {
		// Open the CSV behind a buffered reader so it can be consumed line by line.
		let reader = BufReader::new(File::open("utf_ken_all.csv")?);

		// (kana key, postal code, address) for every non-empty line.
		let mut entries: Vec<(String, String, String)> = Vec::new();

		for line in reader.lines() {
			let line = line?;
			if line.is_empty() {
				continue;
			}

			// Split on commas and strip surrounding double quotes. The fields
			// borrow from `line`; only the values actually kept are copied.
			let fields: Vec<&str> = line
				.split(',')
				.map(|s| s.trim_matches('"'))
				.collect();
			// Checked column lookup that falls back to "" for absent columns.
			let field = |idx: usize| fields.get(idx).copied().unwrap_or("");

			// Key = columns 5+6, postal code = column 3, address = columns 8+9.
			let kana_key = format!("{}{}", field(4), field(5));
			let postal = field(2).to_string();
			let address = format!("{}{}", field(7), field(8));
			entries.push((kana_key, postal, address));
		}

		// Stable sort on the kana key: records with equal keys keep file order.
		entries.sort_by(|a, b| a.0.cmp(&b.0));
		for (_, postal, address) in entries.iter().take(5) {
			println!("{} {}", postal, address);
		}

		Ok(())
	}

kujirahand/sort_simple.rs

Select an option

No results found

Select an option

No results found

kujirahand commented Feb 3, 2026 •

edited

Loading

Uh oh!

kujirahand commented Feb 3, 2026

Uh oh!

kujirahand/sort_simple.rs

kujirahand commented Feb 3, 2026 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

kujirahand commented Feb 3, 2026

Uh oh!

kujirahand commented Feb 3, 2026 •

edited

Loading