Tak2 ·
2025.04.14 ·
조회 1
DSc Neural Language Engine update
<p><br></p><p><br></p><div class="t2-code-block"><pre><code class="" contenteditable="true"><?php
include_once('./_common.php');

// Read the `stx` search term. Only string values are accepted: a request such as
// `?stx[]=a` makes $_GET['stx'] an array, which trim() cannot process.
$stx = (isset($_GET['stx']) && is_string($_GET['stx'])) ? trim($_GET['stx']) : '';

header('Content-Type: application/json; charset=UTF-8');

// Input validation: terms shorter than two characters get an empty suggestion set.
if (empty($stx) || mb_strlen($stx, 'UTF-8') < 2) {
    echo json_encode(['correction' => '', 'related' => []]);
    exit;
}

// Cache settings: one PHP file per distinct search term, named after its MD5 hash.
$cache_key = 'search_suggest_' . md5($stx);
$cache_file = G5_DATA_PATH . '/cache/' . $cache_key . '.php';
$cache_ttl = 3600; // one hour, in seconds

// Cache lookup: serve the stored result while the file is younger than the TTL.
if (file_exists($cache_file) && (time() - filemtime($cache_file)) < $cache_ttl) {
    $cached_data = include $cache_file;
    // Only an array payload is a usable cache entry. Anything else (for example the
    // integer 1 that include yields for a file without a return statement) falls
    // through so the suggestions are recomputed below.
    if (is_array($cached_data)) {
        echo json_encode($cached_data);
        exit;
    }
}
/**
 * Decode the UTF-8 sequence at the start of a string into its Unicode code point.
 *
 * Only the first sequence is examined; any bytes after it are ignored.
 *
 * @param string $c String whose leading bytes should form one UTF-8 sequence.
 * @return int|false The code point, or false when the string is empty, the lead
 *                   byte is not a valid UTF-8 lead byte, the sequence is truncated,
 *                   or a continuation byte is not of the form 10xxxxxx.
 */
function uniord($c) {
    $available = strlen($c);
    if ($available === 0) {
        return false;
    }

    $h = ord($c[0]);
    if ($h <= 0x7F) {
        return $h; // single-byte (ASCII) character
    }
    if ($h < 0xC2) {
        return false; // stray continuation byte or overlong two-byte lead
    }

    // Number of continuation bytes and the payload bits carried by the lead byte.
    if ($h <= 0xDF) {
        $extra = 1;
        $code = $h & 0x1F;
    } elseif ($h <= 0xEF) {
        $extra = 2;
        $code = $h & 0x0F;
    } elseif ($h <= 0xF4) {
        $extra = 3;
        $code = $h & 0x07;
    } else {
        return false;
    }

    // A truncated sequence would otherwise read string offsets past the end.
    if ($available <= $extra) {
        return false;
    }

    for ($i = 1; $i <= $extra; $i++) {
        $byte = ord($c[$i]);
        if (($byte & 0xC0) !== 0x80) {
            return false; // not a continuation byte
        }
        $code = ($code << 6) | ($byte & 0x3F);
    }

    return $code;
}
/**
 * Replace every precomposed Hangul syllable (U+AC00..U+D7A3) in a string with its
 * leading consonant; all other characters are copied through unchanged.
 *
 * @param string $text Input text, read as UTF-8.
 * @return string A string with the same number of characters as $text.
 */
function extractInitials($text) {
    // The 19 leading consonants, indexed by (code point - 0xAC00) / 588.
    static $leads = ['ㄱ', 'ㄲ', 'ㄴ', 'ㄷ', 'ㄸ', 'ㄹ', 'ㅁ', 'ㅂ', 'ㅃ', 'ㅅ', 'ㅆ', 'ㅇ', 'ㅈ', 'ㅉ', 'ㅊ', 'ㅋ', 'ㅌ', 'ㅍ', 'ㅎ'];

    $pieces = [];
    $total = mb_strlen($text, 'UTF-8');
    $pos = 0;
    while ($pos < $total) {
        $glyph = mb_substr($text, $pos, 1, 'UTF-8');
        $pos++;

        $cp = uniord($glyph);
        $outsideSyllables = ($cp < 0xAC00 || $cp > 0xD7A3);
        if ($outsideSyllables) {
            $pieces[] = $glyph;
            continue;
        }
        // Each leading consonant spans 588 consecutive syllables (21 vowels x 28 finals).
        $pieces[] = $leads[(int) (($cp - 0xAC00) / 588)];
    }

    return implode('', $pieces);
}
/**
 * Tell whether a character is a precomposed Hangul syllable that ends in a final
 * consonant.
 *
 * @param string $char A single character (only its first UTF-8 sequence is decoded).
 * @return bool True for a syllable in U+AC00..U+D7A3 whose final-consonant index is
 *              non-zero; false for everything else.
 */
function hasFinalConsonant($char) {
    $cp = uniord($char);
    if ($cp < 0xAC00 || $cp > 0xD7A3) {
        return false;
    }
    // Offsets cycle through 28 final-consonant slots; slot 0 means "no final".
    return ($cp - 0xAC00) % 28 !== 0;
}
/**
 * Check whether a word ends with one of the listed particle/suffix patterns while
 * leaving a stem of at least two characters in front of it.
 *
 * Patterns are tried in list order and the first one that matches the end of the
 * word decides the result; later patterns are not consulted after a match.
 *
 * @param string $word Word to inspect, read as UTF-8.
 * @return bool True when the first matching pattern leaves a stem of two or more
 *              characters; false when that stem is shorter or nothing matches.
 */
function hasParticleOrSuffix($word) {
    static $patterns = [
        '이란', '이라는', '이라고', '이라면', '이었던', '이었다', '이었고', '으로', '으로써', '으로서', '로는',
        '에서', '에서는', '에서도', '라는', '라고', '라면', '라도', '에게', '에게는', '에게서', '에게도',
        '과의', '과는', '과도', '와의', '와는', '와도', '보다', '보다는', '부터', '까지', '만큼', '처럼',
        '이다', '입니다', '이고', '이며', '이나', '하는', '하다', '했던', '했다', '했고', '하고',
        '된다', '됩니다', '되고', '되는', '되며', '되어', '되었', '인', '인데', '인지', '이지', '이네',
        '이요', '이에요', '이예요', '이어요', '지만', '지요', '지는', '지도', '지가'
    ];

    $word_len = mb_strlen($word, 'UTF-8');
    foreach ($patterns as $pattern) {
        $pattern_len = mb_strlen($pattern, 'UTF-8');
        if ($word_len <= $pattern_len) {
            continue; // the pattern must leave at least one character of stem
        }
        if (mb_substr($word, -$pattern_len, null, 'UTF-8') !== $pattern) {
            continue;
        }
        // The stem length is computed arithmetically from the UTF-8 character
        // counts, so it does not depend on the mbstring internal encoding.
        return ($word_len - $pattern_len) >= 2;
    }

    return false;
}
/**
 * Strip trailing particles/affixes from a word, repeatedly, until none applies.
 *
 * Affixes are tried longest first, so a short affix can never shadow a longer one
 * that ends with it (for example '는' vs '이라는', or '로도' vs '으로도').
 * An affix is removed only when the remaining stem keeps at least two characters.
 * Some affixes are additionally gated on whether the stem ends in a final consonant.
 *
 * @param string $word Word to clean, read as UTF-8.
 * @return string The stripped word, or the original word when it has two characters
 *                or fewer or when nothing could be removed.
 */
function removeKoreanParticles($word) {
    static $affixes = [
        '에게서', '한테서', '으로서', '으로써', '부터는', '까지는', '에게도', '한테도', '에서는', '로도',
        '에게는', '한테는', '으로도', '과는', '와는', '보다는', '같이는', '만큼', '같이', '에게', '한테',
        '으로', '에서', '부터', '까지', '보다', '이나', '이랑', '이며', '이고', '은', '는', '이', '가',
        '을', '를', '의', '에', '과', '와', '로', '만', '도', '들', '이란', '이라는', '이라고', '이라면',
        '라는', '라고', '라면', '라도', '이지', '이네', '이요', '이에요', '이예요', '이어요', '지만', '지요',
        '그리고', '그러나', '하지만', '또한', '그런데', '따라서', '그래서', '하다', '되다', '적', '화', '성',
        '인', '인데', '인지'
    ];
    // Same affixes ordered by descending length; built once per request.
    // Two different affixes of equal length cannot both match the same word ending,
    // so the relative order of equal-length entries does not affect the result.
    static $ordered = null;
    if ($ordered === null) {
        $ordered = $affixes;
        usort($ordered, function ($a, $b) {
            return mb_strlen($b, 'UTF-8') - mb_strlen($a, 'UTF-8');
        });
    }

    $word_len = mb_strlen($word, 'UTF-8');
    if ($word_len <= 2) {
        return $word;
    }

    $original = $word;
    while (true) {
        $changed = false;
        foreach ($ordered as $affix) {
            $affix_len = mb_strlen($affix, 'UTF-8');
            if ($word_len <= $affix_len) {
                continue;
            }
            if (mb_substr($word, -$affix_len, null, 'UTF-8') !== $affix) {
                continue;
            }

            $stem = mb_substr($word, 0, $word_len - $affix_len, 'UTF-8');
            if (mb_strlen($stem, 'UTF-8') < 2) {
                continue;
            }

            // These forms are only stripped when the stem ends in a final consonant...
            if (in_array($affix, ['이', '은', '을', '으로'], true)
                && !hasFinalConsonant(mb_substr($stem, -1, 1, 'UTF-8'))) {
                continue;
            }
            // ...and these only when it does not.
            if (in_array($affix, ['가', '는', '를', '로'], true)
                && hasFinalConsonant(mb_substr($stem, -1, 1, 'UTF-8'))) {
                continue;
            }

            $word = $stem;
            $word_len = mb_strlen($word, 'UTF-8');
            $changed = true;
            break; // restart the scan against the shortened word
        }
        if (!$changed) {
            break;
        }
    }

    return ($word !== $original && $word_len >= 2) ? $word : $original;
}
/**
 * Score how similar two words are, on a scale capped at 1.0.
 *
 * The score blends three components: edit-distance similarity of the full text
 * (weight 0.4), edit-distance similarity of the leading-consonant strings
 * (weight 0.3), and a position-weighted match of the leading consonants
 * (weight 0.3, with the first two positions weighted 3 and 2).
 *
 * @param string $word1 First word, read as UTF-8.
 * @param string $word2 Second word, read as UTF-8.
 * @return float 1.0 when the words are equal ignoring case; 0.1 when the shorter
 *               word is less than half as long as the longer one; otherwise the
 *               blended score, raised to at least 0.85 when the words have equal
 *               length, share a prefix of two or more characters and are within an
 *               edit distance of 2.
 */
function calculateOverallSimilarity($word1, $word2) {
    // Multibyte-aware case folding; strtolower() works byte-wise on UTF-8 text.
    if (mb_strtolower($word1, 'UTF-8') === mb_strtolower($word2, 'UTF-8')) {
        return 1.0;
    }

    $len1 = mb_strlen($word1, 'UTF-8');
    $len2 = mb_strlen($word2, 'UTF-8');
    // Both lengths cannot be zero here (two empty strings are equal), so the
    // divisor is always positive.
    if (min($len1, $len2) / max($len1, $len2) < 0.5) {
        return 0.1;
    }

    // Length of the common prefix, in characters.
    $prefix_len = 0;
    $max_prefix = min($len1, $len2);
    for ($i = 0; $i < $max_prefix && mb_substr($word1, $i, 1, 'UTF-8') === mb_substr($word2, $i, 1, 'UTF-8'); $i++) {
        $prefix_len++;
    }

    // Component 1: similarity of the full text.
    $lev_distance = improved_levenshtein($word1, $word2);
    $text_similarity = 1 - ($lev_distance / max($len1, $len2));

    // Component 2: similarity of the leading-consonant strings.
    $initials1 = extractInitials($word1);
    $initials2 = extractInitials($word2);
    $init_len1 = mb_strlen($initials1, 'UTF-8');
    $init_len2 = mb_strlen($initials2, 'UTF-8');
    $init_lev = improved_levenshtein($initials1, $initials2);
    $initial_similarity = 1 - ($init_lev / max($init_len1, $init_len2));

    // Component 3: position-wise match of the leading consonants, weighting the
    // first position 3.0, the second 2.0 and every later position 1.0.
    $position_score = 0;
    $total_positions = min($init_len1, $init_len2);
    if ($total_positions > 0) {
        $matches = 0;
        $weight_sum = 0;
        for ($i = 0; $i < $total_positions; $i++) {
            $weight = ($i == 0) ? 3.0 : ($i == 1 ? 2.0 : 1.0);
            $weight_sum += $weight;
            if (mb_substr($initials1, $i, 1, 'UTF-8') === mb_substr($initials2, $i, 1, 'UTF-8')) {
                $matches += $weight;
            }
        }
        $position_score = $matches / $weight_sum;
    }

    $final_score = ($text_similarity * 0.4) + ($initial_similarity * 0.3) + ($position_score * 0.3);

    // Near-identical words of equal length get a floor of 0.85.
    if ($prefix_len >= 2 && $len1 == $len2 && $lev_distance <= 2) {
        $final_score = max($final_score, 0.85);
    }

    return min($final_score, 1.0);
}
/**
 * Character-level Levenshtein distance with a reduced substitution cost for
 * Hangul syllables that share components.
 *
 * Insertions and deletions cost 1. Substituting two different characters costs
 * 1.0, reduced by 0.3 when both are Hangul syllables with the same leading
 * consonant and by a further 0.4 when they share the same vowel.
 *
 * @param string $str1 First string, read as UTF-8.
 * @param string $str2 Second string, read as UTF-8.
 * @return int|float The distance; an integer when no fractional cost was applied.
 */
function improved_levenshtein($str1, $str2) {
    // Split into characters. preg_split() with the /u flag returns false on
    // malformed UTF-8, so fall back to mb_substr() in that case; the lengths
    // below are taken from the arrays so they always agree with the characters.
    $split = function ($text) {
        $parts = preg_split('//u', $text, -1, PREG_SPLIT_NO_EMPTY);
        if ($parts !== false) {
            return $parts;
        }
        $parts = [];
        $count = mb_strlen($text, 'UTF-8');
        for ($k = 0; $k < $count; $k++) {
            $parts[] = mb_substr($text, $k, 1, 'UTF-8');
        }
        return $parts;
    };

    $chars1 = $split($str1);
    $chars2 = $split($str2);
    $len1 = count($chars1);
    $len2 = count($chars2);
    if ($len1 == 0) return $len2;
    if ($len2 == 0) return $len1;

    // Dynamic-programming table; row 0 and column 0 hold the pure insert/delete costs.
    $matrix = array_fill(0, $len1 + 1, array_fill(0, $len2 + 1, 0));
    for ($i = 0; $i <= $len1; $i++) $matrix[$i][0] = $i;
    for ($j = 0; $j <= $len2; $j++) $matrix[0][$j] = $j;

    for ($i = 1; $i <= $len1; $i++) {
        for ($j = 1; $j <= $len2; $j++) {
            $cost = ($chars1[$i-1] === $chars2[$j-1]) ? 0 : 1.0;
            if ($cost > 0) {
                $code1 = uniord($chars1[$i-1]);
                $code2 = uniord($chars2[$j-1]);
                if ($code1 >= 0xAC00 && $code1 <= 0xD7A3 && $code2 >= 0xAC00 && $code2 <= 0xD7A3) {
                    // Decompose the syllable offset: 588 syllables per leading
                    // consonant, 28 finals per vowel, 21 vowels.
                    $cho1 = intval(($code1 - 0xAC00) / 588);
                    $cho2 = intval(($code2 - 0xAC00) / 588);
                    $jung1 = intval(($code1 - 0xAC00) / 28) % 21;
                    $jung2 = intval(($code2 - 0xAC00) / 28) % 21;
                    if ($cho1 === $cho2) $cost -= 0.3;
                    if ($jung1 === $jung2) $cost -= 0.4;
                }
            }
            $matrix[$i][$j] = min(
                $matrix[$i-1][$j] + 1,      // deletion
                $matrix[$i][$j-1] + 1,      // insertion
                $matrix[$i-1][$j-1] + $cost // substitution or match
            );
        }
    }

    return $matrix[$len1][$len2];
}
/**
 * Collect candidate words for suggestions from the database.
 *
 * Sources: up to 50 popular search words from the last 7 days, plus every
 * letter/number run of two or more characters found in the 20 most recent
 * non-comment subjects of each listed board table.
 *
 * NOTE(review): sql_query() and sql_fetch_array() are project helpers defined
 * outside this file; the falsy-result guards below assume sql_query() can return
 * a falsy value on failure - confirm against the helper.
 *
 * @return array Candidate words with duplicates removed (original keys preserved).
 */
function getCandidates() {
    $candidates = [];

    // Popular search words.
    $sql = "SELECT pp_word FROM g5_popular WHERE pp_date >= DATE_SUB(CURDATE(), INTERVAL 7 DAY) ORDER BY pp_count DESC LIMIT 50";
    $result = sql_query($sql);
    if ($result) {
        while ($row = sql_fetch_array($result)) {
            $candidates[] = $row['pp_word'];
        }
    }

    // Board post subjects.
    $boards = ['g5_write_code', 'g5_write_gallery', 'g5_write_free', 'g5_write_it', 'g5_write_zip', 'g5_write_blog'];
    foreach ($boards as $board) {
        $sql = "SELECT wr_subject FROM $board WHERE wr_is_comment = 0 ORDER BY wr_datetime DESC LIMIT 20";
        $result = sql_query($sql);
        if (!$result) {
            continue; // skip a board whose query failed instead of fetching from a non-result
        }
        while ($row = sql_fetch_array($result)) {
            // preg_match_all() returns false on error (e.g. malformed UTF-8 with
            // the /u flag) and 0 when nothing matches; both leave no words to add.
            if (!preg_match_all('/[\pL\pN]+/u', $row['wr_subject'], $matches)) {
                continue;
            }
            foreach ($matches[0] as $word) {
                if (mb_strlen($word, 'UTF-8') >= 2) {
                    $candidates[] = $word;
                }
            }
        }
    }

    return array_unique($candidates);
}
/**
 * Filter and order candidate words relative to the cleaned search term.
 *
 * Candidates that still carry a particle/suffix are dropped; the rest are
 * stripped of particles and de-duplicated. Words with the same length and the
 * same leading-consonant string as the search term are returned first.
 *
 * @param array  $candidates Raw candidate words.
 * @param string $clean_stx  Search term after particle removal.
 * @return array Cleaned candidates: same-shape matches first, then the others.
 */
function getCleanCandidates($candidates, $clean_stx) {
    $targetShape = extractInitials($clean_stx);
    $targetLength = mb_strlen($clean_stx, 'UTF-8');

    // Words are stored as array keys so each bucket holds every word once.
    $front = [];
    $back = [];

    foreach ($candidates as $raw) {
        if (hasParticleOrSuffix($raw)) {
            continue;
        }

        $stem = removeKoreanParticles($raw);
        $stemLength = mb_strlen($stem, 'UTF-8');
        if ($stemLength < 2) {
            continue;
        }

        $sameShape = ($stemLength === $targetLength) && (extractInitials($stem) === $targetShape);
        if ($sameShape) {
            $front[$stem] = true;
        } else {
            $back[$stem] = true;
        }
    }

    return array_merge(array_keys($front), array_keys($back));
}
// Main processing
$clean_stx = removeKoreanParticles($stx);
$correction = '';
$related_with_scores = []; // candidate term => similarity score

// Suggestions are only produced for terms containing at least one Hangul
// syllable, so the candidate queries are skipped entirely for other input.
if (preg_match('/[\x{AC00}-\x{D7A3}]/u', $stx)) {
    $candidates = getCandidates();
    $clean_candidates = getCleanCandidates($candidates, $clean_stx);

    $exact_match = false;
    $needle = mb_strtolower($clean_stx, 'UTF-8');
    foreach ($clean_candidates as $term) {
        // A candidate equal to the search term marks it as correctly spelled and
        // is never suggested back to the user.
        if (mb_strtolower($term, 'UTF-8') === $needle) {
            $exact_match = true;
            continue;
        }
        $similarity = calculateOverallSimilarity($clean_stx, $term);
        if ($similarity >= 0.5 && !hasParticleOrSuffix($term)) {
            $related_with_scores[$term] = $similarity;
        }
    }

    // Sort by similarity, highest first.
    arsort($related_with_scores);

    // Spelling correction: offered only when no candidate equals the search term.
    if (!$exact_match && !empty($related_with_scores)) {
        $stx_len = mb_strlen($clean_stx, 'UTF-8');
        // Prefer the best-scoring term of the same length with a score of 0.7 or more.
        foreach ($related_with_scores as $term => $score) {
            if ($stx_len === mb_strlen($term, 'UTF-8') && $score >= 0.7) {
                // Numeric-looking terms become integer array keys; cast back to string.
                $correction = (string) $term;
                break;
            }
        }
        // Otherwise fall back to the top-scoring term. reset() makes the array
        // pointer explicit instead of relying on where the loop above left it.
        if ($correction === '') {
            reset($related_with_scores);
            $correction = (string) key($related_with_scores);
        }
    }
}

// Related terms: the five highest-scoring candidates, always emitted as strings.
$related = array_map('strval', array_slice(array_keys($related_with_scores), 0, 5));

$result = [
    'correction' => $correction,
    'related' => $related
];

// Store the result as a PHP file that returns the array when included.
$cache_data = '<?php return ' . var_export($result, true) . '; ?>';
file_put_contents($cache_file, $cache_data, LOCK_EX);

echo json_encode($result);
?></code></pre></div><p><br></p><style>.t2-media-block img, .t2-media-block iframe, .t2-media-block video {border-radius: 15px !important; border: none !important; margin: 0 auto !important;}.file-container {width: 360px; background: white; border-radius: 12px; border: 1px solid #4a4a4a; padding: 20px; display: flex; align-items: center; font-family: Roboto, Arial, sans-serif; margin: 10px 0;}.file-icon {width: 42px; height: 52px; border-radius: 6px; margin-right: 20px; position: relative; flex-shrink: 0; overflow: hidden;}.file-info {flex-grow: 1; min-width: 0;}.file-name {font-size: 17px; font-weight: 500; color: rgba(0,0,0,0.87); margin: 0 0 6px 0; white-space: nowrap; overflow: hidden; text-overflow: ellipsis;}.file-details {color: rgba(0,0,0,0.6); font-size: 14px; line-height: 1.5;}.file-details span {display: inline-block; margin-right: 12px;}.t2-table {width: 100%; border-collapse: collapse; margin: 15px 0; font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, sans-serif;}.t2-table th, .t2-table td {border: 1px solid #ccc; padding: 8px; vertical-align: top;}.t2-table th {background-color: #f5f5f5; font-weight: 500;}.table-responsive {display: block; width: 100%; overflow-x: auto; margin-bottom: 1rem; -webkit-overflow-scrolling: touch;}.t2-table.t2-table-large {min-width: 800px;}</style>