9:["$","$L18",null,{"formats":"$undefined","locale":"en","messages":{"meta":{"title":"Mdoo AI","description":"Free AI, deep learning & machine learning courses. Learn basic math, neural networks, backpropagation, KNN, regression, ensemble step by step with quizzes. AI for beginners—start here.","keywords":"deep learning, machine learning, AI course, free AI course, deep learning for beginners, machine learning tutorial, AI learning, neural network, backpropagation, KNN, linear regression, free course, AI education","learnTitle":"Learn","learnPageSeoTitle":"Basic Deep Learning | Learn","learnDescription":"Free basic deep learning course: dot product, matrix multiplication, linear layer, activation, backpropagation. Learn by chapter with problems and mini neural network playground.","learnKeywords":"basic deep learning, deep learning, dot product, matrix multiplication, neural network, backpropagation, linear layer, activation function, softmax, free deep learning course","learnMathTitle":"Basic Math and AI | Learn","learnMathDescription":"Free basic math for AI and deep learning: functions, vectors, matrices, exponents, logarithms, uniform and normal distributions. AI math foundations.","learnMathKeywords":"basic math, functions, vectors, matrices, AI math, normal distribution, deep learning math","learnMlTitle":"Basic Machine Learning | Learn","learnMlDescription":"Free basic machine learning course: KNN, linear and logistic regression, decision trees, ensemble, K-means, cross-validation, recommendation systems. 
ML for beginners.","learnMlKeywords":"basic machine learning, machine learning, KNN, linear regression, logistic regression, decision tree, ensemble, K-means, cross-validation, recommendation system, free ML course","learnMidMlTitle":"Intermediate Machine Learning | Learn","learnMidMlDescription":"Data preprocessing (scaling, encoding, imputation), PCA, SVM, boosting basics, DBSCAN, GMM, pipelines, and hyperparameter tuning for real-world ML.","learnMidMlKeywords":"intermediate ML, scaling, encoding, imputation, PCA, SVM, boosting, AdaBoost, GBM, DBSCAN, GMM, pipeline, Optuna","learnAdvDlTitle":"Advanced Deep Learning | Learn","learnAdvDlDescription":"Transformer, BERT, GPT, FlashAttention, ViT, self-supervised learning, prompt engineering, LoRA, QLoRA, RLHF, DPO, RAG, LLM agents, GNN, XAI, autoencoder, VAE, GAN, diffusion, VLM, speech, knowledge distillation, deployment: large models and generative AI in chapters.","learnAdvDlKeywords":"advanced deep learning, Transformer, BERT, GPT, FlashAttention, ViT, LoRA, QLoRA, RLHF, DPO, RAG, LLM agents, GNN, Grad-CAM, VAE, GAN, diffusion model, Stable Diffusion, CLIP, Whisper, knowledge distillation, TensorRT, vLLM","learnMidDlTitle":"Intermediate Deep Learning | Learn","learnMidDlDescription":"Weight initialization, Adam, learning rate scheduling, regularization, batch/layer norm, data augmentation, CNN, pooling, ResNet, efficient convolution, transfer learning, object detection, image segmentation, tokenization, word embedding, 1D CNN, RNN, LSTM, GRU, encoder-decoder, attention: stable training and unstructured data in chapters.","learnMidDlKeywords":"intermediate deep learning, weight initialization, Xavier, He, Adam, RMSprop, learning rate scheduling, regularization, dropout, batch norm, layer norm, data augmentation, CNN, pooling, ResNet, MobileNet, transfer learning, YOLO, SSD, U-Net, tokenization, BPE, Word2Vec, GloVe, RNN, LSTM, GRU, attention","learnMidMathTitle":"Intermediate Math | 
Learn","learnMidMathDescription":"Vectors, matrices, linear transformation, eigenvalues, gradient, Jacobian, Hessian, Taylor series, convex optimization, conditional probability, Bayes, covariance, multivariate normal, MLE, entropy, cross-entropy: intermediate math for multivariable and uncertainty, chapter by chapter.","learnMidMathKeywords":"intermediate math, vector space, dot product, matrix, linear transformation, inverse, determinant, rank, eigenvalue, eigenvector, gradient, Jacobian, Hessian, Taylor series, convex optimization, conditional probability, Bayes theorem, covariance, MLE, entropy, KL divergence","learnAdvMathTitle":"Advanced Math | Learn","learnAdvMathDescription":"SVD, tensor algebra, Lagrange, Markov, Monte Carlo, MCMC, EM, MAP, variational inference, Wasserstein, MDP, Fourier, graph Laplacian, SDE, Langevin, information geometry: advanced math for generative models and optimization, chapter by chapter.","learnAdvMathKeywords":"advanced math, SVD, pseudoinverse, tensor, Lagrange, KKT, Markov, Monte Carlo, MCMC, EM, MAP, variational inference, Wasserstein, MDP, Bellman, Fourier, graph Laplacian, SDE, Langevin, score matching, information geometry","learnAdvMlTitle":"Advanced Machine Learning | Learn","learnAdvMlDescription":"Feature engineering, PCA, t-SNE, SVM, kernels, boosting, XGBoost, imbalanced data, anomaly detection, DBSCAN, GMM, hyperparameter tuning, cross-validation, XAI, SHAP, time series, recommender systems: advanced ML for nonlinear problems, complex data, optimization, and interpretability.","learnAdvMlKeywords":"advanced machine learning, feature engineering, PCA, t-SNE, UMAP, SVM, kernel, boosting, XGBoost, LightGBM, SMOTE, anomaly detection, DBSCAN, GMM, hyperparameter, Optuna, XAI, SHAP, LIME, time series, ARIMA, Prophet, matrix factorization, FM","playgroundTitle":"Mini Neural Network Playground","playgroundDescription":"Draw and explore neural network structures in your browser.","communityTitle":"IT 
News","communityDescription":"Stay up to date with the latest AI and IT news and development trends. New posts are added regularly; find them via search.","communityKeywords":"IT news, AI news, artificial intelligence news, machine learning, deep learning, LLM, AI development trends, tech news, AI updates","studiesTitle":"Studies","studiesDescription":"Find deep learning study groups and learning resources.","curriculumTitle":"Book reading","curriculumDescription":"Create and share book-based learning roadmaps.","supportTitle":"Support & Contact","supportDescription":"How to use Mdoo AI, Chrome extension, and support for Learn and community.","privacyTitle":"Privacy Policy","privacyDescription":"How Mdoo AI collects, uses, and stores personal information.","termsTitle":"Terms of Service","termsDescription":"Terms of service for Mdoo AI.","refundTitle":"Refund Policy","refundDescription":"Refund policy for Learn paid subscription.","aboutTitle":"What is Mdoo AI?","aboutDescription":"An AI education platform built by an AI researcher. Learn basic math and deep learning step by step. Based on experience from K-League AI Competition 3rd place, Financial AI Challenge 22nd, and more."},"support":{"title":"Support & Contact","intro":"For how to use Mdoo AI (mdooai.com), error reports, and suggestions, please refer to the following.","serviceTitle":"Service introduction","serviceContent":"Mdoo AI is an education platform that helps you understand deep learning and AI from the ground up. It offers Learn (chapter-by-chapter visuals, some chapters paid subscription), Book reading (book-based roadmaps), Community (learning material sharing), and a Chrome extension (open the learning page in a new tab).","extensionTitle":"Chrome extension","extensionContent":"Clicking the toolbar icon opens the learning page (https://mdooai.com/learn) in a new tab. 
For installation or usage questions, contact us via this support page or the extension's Chrome Web Store listing.","extensionInstallCta":"Install from Chrome Web Store","contactTitle":"Contact us","contactContent":"For general inquiries, error reports, or suggestions, please use the contact option on mdooai.com or the published contact details. We will respond as soon as possible.","linksTitle":"Related links","learnLink":"Learn","privacyLink":"Privacy Policy","termsLink":"Terms of Service","refundLink":"Refund Policy","supportUrlLabel":"Support URL"},"about":{"title":"What is Mdoo AI?","intro1":"Hello. I'm Lee Jong-hyeon, an AI researcher. I majored in computer science and am currently in the Master's program in AI at Yonsei University, researching machine learning and deep learning.","intro2":"I have participated in various AI competitions and developed models used in industry. Through that, I learned one important lesson: while technique matters, what really determines the difference in performance is understanding the fundamentals. These days you can implement models quickly with vibe coding, but when performance doesn't meet expectations, analyzing the cause and improving is still not easy. Without an understanding of the mathematical foundations and AI principles, it's difficult to structurally identify where bottlenecks occur.","intro3":"","intro4Before":"So I developed and released this learning platform based on what I've studied and organized. If you'd like lectures or training, feel free to reach out at ","intro4After":" and I'll be glad to help.","curriculumTitle":"Curriculum","curriculumIntro":"The platform is structured as a step-by-step curriculum from basic math to core deep learning concepts.","part1Title":"📘 Part 1. 
Basic Math and AI","part1Ch0":"Ch.00 Basic Math and AI: Learning the Language of AI","part1Ch1":"Ch.01 Functions: AI's Basic Unit of Input and Output","part1Ch2":"Ch.02 Exponents and Exponential Functions: The Math of Growth and Activation","part1Ch3":"Ch.03 Logarithm: Turning Multiplication into Addition, Designing Loss","part1Ch4":"Ch.04 Limits and ε-δ: Defining 'Approaching Infinitely Close'","part1Ch5":"Ch.05 Continuity: Smooth Curves, Opening the Door to Calculus","part1Ch6":"Ch.06 Derivatives: Instantaneous Slope, the Compass of Learning","part1Ch7":"Ch.07 Chain Rule: Unraveling Nested Functions, the Core of Backpropagation","part1Ch8":"Ch.08 Partial Derivatives and Gradient: Multi-Variable World, Direction of Gradient Descent","part1Ch9":"Ch.09 Integration: Area and Accumulation, the Bridge to Probability","part1Ch10":"Ch.10 Random Variables and Distributions: Capturing Uncertainty in Numbers","part1Ch11":"Ch.11 Mean and Variance: Center and Spread of Distributions","part1Ch12":"Ch.12 Uniform and Normal Distributions: From Initialization to Prediction","part2Title":"📗 Part 2. 
Understanding Deep Learning Structure","part2Ch0":"Ch.00 First Steps in Deep Learning: How Does AI Think?","part2Ch1":"Ch.01 Dot Product: Finding Similarity in Data","part2Ch2":"Ch.02 Matrix Multiplication: The Magic of Batch Computation","part2Ch3":"Ch.03 Linear Layer: Weights That Decide Importance","part2Ch4":"Ch.04 Activation Functions: Adding Judgment to AI","part2Ch5":"Ch.05 Artificial Neuron: The Unit That Gathers Information and Sends Signals","part2Ch6":"Ch.06 Batch Processing: Learning in Batches","part2Ch7":"Ch.07 Weight Connections: The Chains That Build Intelligence","part2Ch8":"Ch.08 Hidden Layers: The Invisible Depth of Thought","part2Ch9":"Ch.09 Deep Neural Networks: The Power to Solve More Complex Problems","part2Ch10":"Ch.10 Width and Neurons: Finding More Features at Once","part2Ch11":"Ch.11 Softmax: Turning Results into Confidence","part2Ch12":"Ch.12 Gradient and Backpropagation: Learning from Mistakes","part2Ch13":"Ch.13 Summary: A Map of AI at a Glance","curriculumNote":"Rather than simple concept summaries, the content follows the flow of computation step by step so you understand 'why it works this way.' It's visualization- and interaction-focused.","approachTitle":"Learning Approach","approachContent":"Rather than listing concept summaries, the content follows the flow of computation step by step so you understand 'why it works this way.' It's centered on visualization and interaction, with immediate AI coach feedback to correct misconceptions.","roadmapTitle":"Future Plans","roadmapContent":"We plan to continuously expand with more AI education content, including machine learning. If you're interested, feel free to contact us at ","roadmapContactAfter":" anytime.","feedbackNote":"It's still an early version, but we're improving it continuously. 
Your feedback is welcome and will be actively incorporated.","ctaLearn":"Start Learning","ctaDeveloper":"View Developer Profile","chromeExtensionTitle":"Add to Chrome Web Store","chromeExtensionDesc":"Install the Chrome extension to open the learning page in a new tab."},"terms":{"title":"Terms of Service","effectiveDate":"Effective: March 2, 2026 (updates will be announced on this page).","intro":"Thank you for using Mdoo AI (mdooai.com). These terms apply to your use of our services.","section1Title":"1. Applicable scope","section1Content":"These terms apply to the Mdoo AI website and related services (Learn, Book reading, Community, etc.). Only Learn has paid subscription content (some chapters). By using the service, you agree to these terms.","section2Title":"2. Service use","section2Content":"You may use the service after signing up or logging in. Learn has free and paid subscription chapters; Book reading, Community, and other services are free. Payment and refund terms for Learn are on the relevant policy pages.","section3Title":"3. Prohibited acts","section3Content":"You may not misuse accounts, disrupt the service, violate laws, or copy content for commercial use without permission. Violations may result in restricted access.","section4Title":"4. Terms changes","section4Content":"We may update these terms and will announce changes on this page. Significant changes will state an effective date. Continued use after changes constitutes acceptance.","section5Title":"5. Inquiries","section5Content":"For questions about these terms or the service, please contact us via mdooai.com or the support/contact options on the site.","termsUrlLabel":"Terms of Service URL"},"refund":{"title":"Refund Policy","effectiveDate":"Effective: March 2, 2026 (updates will be announced on this page).","intro":"Mdoo AI Learn (paid subscription) is billed monthly at 4 USD per month. This policy covers payment and refunds.","section1Title":"1. 
Subscription fee and payment","section1Content":"The Learn paid subscription fee is 4 USD per month and is automatically renewed and charged each month from your payment date. Payment is processed by Paddle or another payment provider; you will be charged according to the amount, currency, and billing cycle shown at checkout.","section2Title":"2. Refunds","section2Content":"If you are not satisfied with the service, you may request a full refund within 7 days of your first payment date. After 7 days or from the second payment onward, no refund is given for the current month’s period. Request refunds via the site’s support/contact or Paddle customer support.","section3Title":"3. Cancellation","section3Content":"You may cancel your subscription at any time. After cancellation, access to paid Learn chapters continues until the end of the current billing period; you will not be charged from the next billing date. No refund is given for the current month already paid.","section4Title":"4. Applicability and contact","section4Content":"Refund and cancellation procedures follow the policy in effect at the time of payment and Paddle’s policy. For refund, payment, or cancellation questions, use the mdooai.com support page or Paddle customer support.","refundUrlLabel":"Refund Policy URL"},"privacy":{"title":"Privacy Policy","effectiveDate":"Effective: March 2, 2026 (updates will be announced on this page).","section1Title":"1. Scope","section1Content":"This Privacy Policy applies to the Mdoo AI (mdooai.com) website and related services (Learn, Book reading, Community, Chrome extension, etc.). Only Learn has paid subscription chapters.","section2Title":"2. Information we collect","section2Intro":"The following information may be collected and used when you use our services.","section2List1":"Account information: email, password, display name, etc. 
when you sign up or log in.","section2List2":"Usage data: learning progress, community posts and comments, book reading roadmaps, etc.","section2List3":"Device and environment: browser, access logs, etc. (for service improvement and security).","section2List4":"Learn payment and subscription: payment is processed by Paddle or another payment provider; we do not store card details. Learn subscription and purchase information may be used to provide access to paid chapters and handle refunds or cancellation.","section2Extension":"The Chrome extension does not collect or transmit user data. It only opens the learning page in a new tab when you click the icon.","section3Title":"3. How we use the information","section3Content":"Collected information is used to provide and improve the service, respond to inquiries, ensure security and prevent abuse, and comply with applicable laws.","section4Title":"4. Retention and deletion","section4Content":"Personal information is securely deleted after the purpose of use is fulfilled or after the legally required retention period. We also delete data in accordance with our procedures when you request deletion or account closure.","section5Title":"5. Third-party sharing","section5Content":"We do not sell or provide your personal information to third parties without your consent. We may share information only when required by law or with your consent.","section6Title":"6. Policy changes","section6Content":"We will update this page when the Privacy Policy changes. Significant changes will be announced with an effective date.","section7Title":"7. 
Contact","section7Content":"For questions about how we handle personal information, please contact us via mdooai.com or the contact option on the site.","privacyUrlLabel":"Privacy Policy URL"},"common":{"appName":"Mdoo AI","headerBrand":"I am Doo AI","loading":"Loading...","close":"Close","back":"Back","backToHome":"← Home","chapterSelect":"Select chapter","chapterSearchNoResults":"No results found.","chapterListEmpty":"No chapters.","chapters":"Learn","curriculum":"Book reading","community":"Community","itNews":"IT News","language":"Language","openMenu":"Open menu","closeMenu":"Close menu","menu":"Menu","communityComingSoon":"Community section is coming soon.","searchPlaceholder":"Search chapters, concepts…","globalSearchPlaceholder":"Search all chapters…","globalSearchNoResults":"No results found.","answer":"Answer","wrongAnswerGuideButton":"Why was it wrong?","signIn":"Sign in","signUp":"Sign up","myAccount":"My Account","signOut":"Sign out","aboutLink":"What is Mdoo AI?","myAchievements":"My achievements","moreServices":"More","allServices":"All services"},"community":{"title":"IT News","subtitle":"Stay up to date with the latest AI and IT news and development trends.","allPosts":"All posts","viewFullCommunity":"View full community","sortNewest":"Newest","sortOldest":"Oldest","newPost":"New post","createPost":"Create post","uploadMaterial":"Upload material","uploadTitle":"Title","category":"Category","categoryAll":"All","categoryPlaceholder":"Select category","category_ai_news":"AI News","category_ai_basics":"AI Basics","category_machine_learning":"Machine Learning","category_deep_learning":"Deep Learning","category_nlp":"Natural Language Processing","category_computer_vision":"Computer Vision","category_llm":"Large Language Models","category_prompt_engineering":"Prompt Engineering","category_ai_ethics":"AI Ethics","category_ai_tools":"AI Tools","category_study_material":"Study 
Materials","priceTypeFree":"Free","priceTypePaid":"Paid","price":"Price","pricePlaceholder":"e.g. 10,000 KRW","uploadTitlePlaceholder":"e.g. Dot product practice sheet","uploadDescription":"Description","uploadDescriptionPlaceholder":"Describe the material and how to use it...","uploadFile":"Attach file (optional)","uploadSubmit":"Publish","uploading":"Publishing...","download":"Download","postedAt":"posted","noPosts":"No posts yet. Be the first to share!","searchPlaceholder":"Search title or description","prevPage":"Previous","nextPage":"Next","pageOf":"Page {current} of {total}","scrollToTop":"Scroll to top","signInToPost":"Sign in to upload materials.","errorLoad":"Failed to load posts.","errorPublish":"Failed to publish. Try again.","errorPriceRequired":"Please enter the price for paid posts.","backToFeed":"Back to feed","postedAnUpdate":"posted an update","postLabel":"Post","inThisPost":"In this post","replyPlaceholder":"Reply to {name}'s post","replyComingSoon":"Replies are coming soon.","errorPostNotFound":"Post not found.","deletePost":"Delete post","deleteConfirm":"Delete this post?","errorDelete":"Failed to delete.","editPost":"Edit post","comments":"Comments","commentPlaceholder":"Write a comment","commentSubmit":"Post","commentSubmitting":"Posting…","commentEdit":"Edit","commentDelete":"Delete","commentDeleteConfirm":"Delete this comment?","commentCancel":"Cancel","commentSave":"Save","noComments":"No comments yet.","errorComment":"Failed to post comment.","errorCommentEdit":"Failed to update.","errorCommentDelete":"Failed to delete.","removeFile":"Remove","editForbidden":"You don't have permission to edit.","backToPost":"Back to post","currentFile":"Current","removeFileLabel":"Remove attachment"},"curriculum":{"title":"Book reading","listTitle":"Book reading","listSubtitle":"Create and share book-based learning roadmaps. 
Browse recommended book reading.","createNew":"New book reading","newTitle":"Create book reading","subtitle":"Search for a textbook and get a learning roadmap so you can follow the track to reach your learning goal.","searchBooks":"Search books","autocompleteLabel":"Autocomplete","searchResults":"Select from search results","searchResultsEmpty":"Search for books to see results here.","requiredBookTitle":"Please enter the book title. (Required)","aiAutoLabel":"AI auto-generate","generateHint":"After entering the book title, click the button and AI will generate a learning roadmap.","generateWithAI":"Generate book reading with AI","fillRequiredToGenerate":"Enter a book title to enable this button.","resultEmptyHint":"Click \"Generate book reading with AI\" above to fill this area. You can edit and save.","requiredToSave":"Please enter both book title and book reading content to save.","searchPlaceholder":"Search by book title, author, or topic…","searchButton":"Search","searching":"Searching…","noBooks":"No results. Try a different search.","selectBook":"Create book reading from this book","editBookInfo":"Book info (editable)","searchOrManualHint":"Search for a book to select it, or enter the details below. You can create a book reading with just a title if the book is not in the catalog.","bookTitle":"Book title","bookTitlePlaceholder":"e.g. Introduction to Deep Learning","bookImageUrl":"Book cover image URL","isbnPubdate":"ISBN / Publication date","bookInfo":"Book information","bookDescription":"Book description","isbn":"ISBN","pubdate":"Publication date","generating":"Generating book reading…","generateError":"Failed to generate book reading. Please try again.","searchError":"Book search failed.","optionalRequest":"Additional request (optional)","optionalRequestPlaceholder":"e.g. 
For beginners, 2-week course, focus on understanding ML…","resultTitle":"Generated learning roadmap","shortDescription":"Short description (shown in list)","shortDescriptionPlaceholder":"e.g. Step-by-step learning roadmap from basics to advanced","shortDescriptionHint":"Shown as preview on the list. Leave empty to use content summary.","editCurriculum":"Edit the content below if needed, then save.","save":"Save","saving":"Saving…","saveSuccess":"Saved.","saveError":"Failed to save.","signInToSave":"Sign in to save.","author":"Author","publisher":"Publisher","sortNewest":"Newest","sortOldest":"Oldest","sortPopular":"Popular","curriculaSearchPlaceholder":"Search title or summary","prevPage":"Previous","nextPage":"Next","pageOf":"Page {current} of {total}","scrollToTop":"Scroll to top","noCurricula":"No saved book reading yet. Create one!","notFound":"Book reading not found.","like":"Recommend","likes":"Recommends","createdBy":"Created by","anonymous":"Anonymous","edit":"Edit","delete":"Delete","deleteConfirm":"Delete this book reading?","editCurriculumMenu":"Menu","editTitle":"Edit book reading","cancel":"Cancel","backToCurriculum":"Back to book reading","backToDetail":"Back to detail","editForbidden":"Only the author can edit."},"auth":{"loading":"Loading...","signIn":{"title":"Sign in","subtitle":"Enter your email or username and password.","identifierLabel":"Email or username","identifierPlaceholder":"Enter email or username","passwordLabel":"Password","passwordPlaceholder":"Enter password","submit":"Continue","submitting":"Signing in...","noAccount":"Don't have an account?","signUpLink":"Sign up"},"signUp":{"title":"Create your account","subtitle":"Please fill in the details below to get started.","usernameLabel":"Username","usernamePlaceholder":"4–64 characters, letters and numbers","usernameRules":"4–64 characters, Latin letters only. Special characters ^ $ ! . 
` # + ~ are not allowed.","emailLabel":"Email address","emailPlaceholder":"Enter your email address","passwordLabel":"Password","passwordPlaceholder":"Enter your password","submit":"Continue","submitting":"Processing...","hasAccount":"Already have an account?","signInLink":"Sign in"},"verifyEmail":{"title":"Verify your email","subtitleSignIn":"Enter the verification code sent to your email.","subtitleSignUp":"Enter the verification code sent to your email address.","codeLabel":"Verification code","codePlaceholder":"Enter verification code","submit":"Verify","submitting":"Verifying...","verifyButton":"Verify","back":"Back","backSignIn":"Sign in another way"},"errors":{"generic":"Something went wrong. Please try again.","username_length":"Username must be between 4 and 64 characters.","username_non_number":"Username must contain at least one non-numeric character (e.g. a letter).","username_latin_only":"Usernames can only use Latin letters (e.g. English). You can set a display name in your preferred language after sign-up.","password_length":"Please check the password length requirements.","form_identifier_exists":"This email or username is already in use.","form_identifier_not_found":"No account found with this identifier.","form_password_incorrect":"Incorrect password.","form_code_incorrect":"Invalid verification code.","form_password_compromised":"A security issue was detected with your password. Please sign in using another method, such as email verification.","user_locked":"Sign-in is temporarily locked. 
Please try again later.","display_name_min_length":"Display name must be at least 4 characters."}},"landing":{"heroTitle":"Where you learn AI the easy way","heroSubtext":"Learn step by step, the right way.","heroTagline":"The place where everyone learns AI.","forEveryone":"The platform for learning AI from the ground up—concepts, computation, and instant feedback.","heroCurriculum":"Create and share book-based learning roadmaps with other learners.","heroCommunity":"Share and download AI learning materials in the community.","ctaAbout":"What is Mdoo AI?","ctaExplore":"Start deep learning","ctaMath":"Start math","ctaMl":"Start machine learning","ctaBrowse":"Browse book reading","ctaBrowseCommunity":"Browse community","trendingLabel":"Quick access","homeOfTitle":"The Home of AI Learning","homeOfSubtitle":"Discover step by step, practice hands-on, and learn with AI feedback.","featurePlatformTitle":"Learning platform","featurePlatformDesc":"Learn foundations, deep learning, and machine learning chapter by chapter, with no limits.","featureFasterTitle":"Move faster","featureFasterDesc":"Concepts, practice problems, and instant AI feedback to level up.","featureExploreTitle":"Explore all levels","featureExploreDesc":"Foundations, deep learning, and ML—step by step. 
We improve continuously with your feedback.","featureBadgeTitle":"Achievements & certificate","featureBadgeDesc":"Complete chapters to earn achievements and receive a certificate of completion.","featurePortfolioTitle":"Grow together","featurePortfolioDesc":"Share your learning, get the latest development news, and connect with fellow learners.","signUpCta":"Sign Up","problemTitle":"Why you need to do the math yourself","problemBody":"If you only use APIs, it's hard to explain why a model produced a given result.\n\nDot products, matrix multiplication, gradients—unless you work through these calculations yourself, it's difficult to grasp why performance dropped or where things went wrong.\n\nMost courses show only results and formulas, and don't give you enough opportunity to check the computation step by step.","solutionSectionLabel":"How it works","solutionTitle":"Learn concepts easily and solve problems. When stuck, just ask the AI","solutionIntro":"From dot product to gradient—core deep learning math, structured across 12 chapters.","solutionList":"Every chapter has concept overviews and practice problems. When wrong or stuck, you can ask the AI.","solutionBody":"When you're curious or got it wrong, you can ask the AI coach.","ctaStartLearning":"Start learning deep learning","globalPlatform":"KO · EN · JA · ZH","learnShortDesc":"Basic deep learning: 12 chapters from dot product to gradient—concepts, problems, and instant grading.","heroImageAlt":"AI learning background","dlCardTitle":"Basic Deep Learning","advMathCardTitle":"Advanced Math","learnAdvMathShortDesc":"SVD, tensors, Markov, MCMC, variational inference, Wasserstein, SDE, information geometry. Advanced math for generative models and optimization, chapter by chapter.","ctaAdvMath":"Advanced Math","advMlCardTitle":"Advanced Machine Learning","learnAdvMlShortDesc":"Feature engineering, PCA, SVM, boosting, XGBoost, imbalanced data, anomaly detection, DBSCAN, XAI, SHAP, time series, recommender systems. 
Chapter-by-chapter advanced ML.","ctaAdvMl":"Advanced ML","mlCardTitle":"Basic Machine Learning","learnMlShortDesc":"From data and features to KNN, linear and logistic regression, and recommendation systems. Learn basic machine learning chapter by chapter.","midDlCardTitle":"Intermediate deep learning","learnMidDlShortDesc":"Weight init, Adam, regularization, CNN, ResNet, transfer learning, object detection, tokenization, RNN, LSTM, attention. Stable training and unstructured data in chapters.","ctaMidDl":"Intermediate deep learning","advDlCardTitle":"Advanced deep learning","learnAdvDlShortDesc":"Transformer, BERT, GPT, LoRA, QLoRA, RLHF, RAG, agents, GAN, diffusion, VLM, knowledge distillation, deployment. Large models and generative AI in chapters.","ctaAdvDl":"Advanced deep learning","learnMathShortDesc":"From functions, vectors, and matrices to uniform and normal distributions. Build the foundations for understanding AI.","mathCardTitle":"Basic math","midMathCardTitle":"Intermediate math","learnMidMathShortDesc":"Vectors, matrices, linear transformation, eigenvalues, gradient, Jacobian, Hessian, convex optimization, Bayes, MLE, entropy. Learn multivariable and uncertainty math chapter by chapter.","ctaMidMath":"Intermediate math","quickAccessTitle":"Math · Deep learning · Machine learning","curriculumShortDesc":"Design your own book-based learning roadmap and grow together with other learners.","communityShortDesc":"Share AI and deep learning materials, get the latest development news, and connect with fellow learners.","itNews":"IT News","itNewsShortDesc":"Stay up to date with the latest AI and IT news and development trends.","coupangBannerText":"Discover a wide range of products on Coupang"},"home":{"introButton":"About the service","intro":"An AI-powered learning platform that helps beginners not get stuck on concepts and formulas. 
Practice calculations, get feedback from an AI coach to fix misconceptions, and understand step by step how AI learns and reasons.","problem":"Problem","problemPrompt":"Find the dot product __DOT_FORMULA__ of the vectors below.","problemPromptMatrix":"Find the value that goes in the blank (?) in the matrix product __MATRIX_AB__ below.","problemPromptLinearLayer":"Find the value that goes in the blank (?) in the linear layer __LINEAR_FORMULA__ below.","problemPromptActivation":"Given the activation function (Sigmoid, ReLU, Tanh), find Y for each X and fill in the blank (?).","problemPromptArtificialNeuron":"Artificial neuron: apply the given activation (ReLU, Sigmoid, or Tanh) to get Y, and fill in the blank (?).","problemPromptBatch":"Fill in the blank (?) in the batch operation (weight times input plus bias, add, subtract, multiply, subtract mean, sum, or mean).","prev":"Previous","next":"Next","inputSectionTitle":"Your solution","askSectionTitle":"Ask a question","practicePadTitle":"Practice pad","fabMenuLabel":"Ask menu","practicePadSeeMain":"Solve the problem on the main screen.","drawMode":"Handwrite","keyboardMode":"Type","drawHint":"Draw your solution in the area below. After drawing, click \"Grade with AI\" to get feedback.","keyboardHint":"Enter your solution or answer below. After entering, click \"Grade with AI\" to get feedback.","askDrawHint":"Write your question here by hand. After writing, click \"Ask\" to get an answer.","askKeyboardHint":"Type your question here. Click \"Ask\" to get an answer.","askPlaceholder":"e.g. Why does this formula work like this?","askSubmit":"Ask","asking":"Sending...","askResponseTitle":"Answer","drawQuestionLabel":"(Question with drawing)","askEmptyAlert":"Please draw or type your question, then click Ask.","errorAsk":"An error occurred while sending your question. 
Please try again.","errorAskRequest":"Ask request failed","askErrorEmptyQuestion":"Please draw or type your question.","solutionErrorNoContent":"Could not generate the solution.","solutionErrorServer":"An error occurred while generating the solution.","ariaAskInput":"Type your question","placeholder":"Enter your steps or final answer. e.g. a·b = 3·5 = 15","ariaKeyboardInput":"Type your solution","clear":"Clear","grade":"Grade with AI","gradeShort":"Grade","grading":"Grading...","correctAnswer":"Correct!","wrongAnswer":"Incorrect. Please try again.","tryAgain":"Try again.","checkAnswer":"Check answer","chapterCompleteTitle":"Chapter complete!","chapterCompleteBadge":"{chapterName} achievement earned","chapterCompleteLoginHint":"If you sign in now, this chapter will be marked complete and you won't need to solve it again.","chapterCompleteSignInCta":"Sign in and save completion","chapterCompleteTryAgain":"Try again","chapterCompleteNextChapter":"Next chapter","badgeSaved":"Achievement saved.","certificateTitle":"Certificate of Completion","certificateSubtitlePrefix":"This is to certify that the person named below has completed the following courses of Mdoo AI (https://mdooai.com) ","certificateSubtitleEnd":"Learn.","certificateHolder":"Holder","certificateHolderEditHint":"You can type the name directly.","certificateHolderModalTitle":"Enter the recipient name","certificateHolderModalConfirm":"Confirm","certificateHolderModalPrint":"Print","certificateHolderEdit":"Edit","certificateCompleted":"Completed courses","certificateIssuer":"Issuer","certificateIssuerName":"Mdoo AI","certificateIssuerUrl":"https://mdooai.com","certificateDate":"Date issued","certificatePrint":"Print certificate","certificateNoBadges":"No completed chapters yet. 
Complete chapters to receive a certificate.","certificateSignInRequired":"Please sign in to issue a certificate.","certificateIssue":"Issue certificate","profileTitle":"My learning","profileBadgesSection":"Earned achievements","profileNoBadges":"No completed chapters yet.","profileCertificateLink":"Issue certificate","profileMyBadges":"My achievements","profileBadgesCta":"View my achievements / Issue certificate","badgesPageTitle":"My Achievements & Certificate","badgesPageDesc":"View your earned achievements and certificate of completion.","badgesAdminMode":"(Admin Preview)","badgesAdminModeDesc":"All achievements are shown and the full certificate is printed.","mathFunctionsProblemPrompt":"Given f(x) = ax + b, fill in the blank (?).","mathFunctionsProblemPromptInput":"Given f(?) = value, find x and fill in the blank.","mathFunctionsProblemPromptCompare":"Choose the larger value and enter 1 or 2.","mlKnnProblemPrompt":"Read the instruction below, find the answer, then enter it in the blank (?).","mlLinearRegressionProblemPrompt":"Read the instruction below, find the answer, then enter it in the blank (?).","mlLinearRegressionProblemPromptPredict":"For the linear regression model $\\hat{y} = w x + b$ with $w={w}$, $b={b}$, find the predicted value $\\hat{y}$ when $x={x}$. Enter an integer.","mlLinearRegressionProblemPromptSlope":"Find the slope $w = \\frac{y_2-y_1}{x_2-x_1}$ of the line through ({x1}, {y1}) and ({x2}, {y2}). Enter an integer.","mlLinearRegressionProblemPromptIntercept":"A line with slope $w={w}$ passes through ({x}, {y}). Find the intercept $b = y - w x$. Enter an integer.","mlLinearRegressionProblemPromptTwoPointPredict":"The line through ({x1}, {y1}) and ({x2}, {y2}) is given. Find the $y$ value on the line when $x={x}$. Enter an integer.","mlLinearRegressionProblemPromptResidual":"The line $\\hat{y}={w}x+{b}$ predicts values. The actual observation is at ({x}, {y}). Find the residual $y - \\hat{y}$. 
Enter an integer.","mlLinearRegressionProblemPromptResidualSum":"Points {points} and line $\\hat{y}={w}x+{b}$. Find the sum of residuals $\\sum_i (y_i - \\hat{y}_i)$. Enter an integer.","mlMseProblemPrompt":"Read the instructions below, find the answer, and enter it in the blank (?).","mlMseProblemPromptSquaredError":"When actual $y={y}$ and prediction $\\hat{y}={yHat}$, find the squared error $(y - \\hat{y})^2$. Enter an integer.","mlMseProblemPromptSse":"For the following (actual, prediction) pairs, find the sum of squared errors $\\sum_i (y_i - \\hat{y}_i)^2$. {pairs} Enter an integer.","mlMseProblemPromptMse":"For the following (actual, prediction) pairs, find the mean squared error MSE $= \\frac{1}{n}\\sum_i (y_i - \\hat{y}_i)^2$. {pairs} Enter an integer.","mlMseProblemPromptMseFromLine":"Points {points} and line $\\hat{y}={w}x+{b}$. Find the MSE. Enter an integer.","mlMseProblemPromptMissingSquaredError":"MSE $= {mse}$, $n = {n}$, and $n-1$ squared errors are {squaredErrors}. Find the remaining squared error. Enter an integer.","mlMseProblemPromptRmse":"When MSE $= {mse}$, find RMSE $= \\sqrt{\\text{MSE}}$. Enter an integer.","mlMseProblemSolvingTable":"$19","mlLogisticProblemPrompt":"Read the instructions below, find the answer, and enter it in the blank (?).","mlLogisticProblemPromptLinearScore":"In the linear score $z = wx + b$ of logistic regression, when $w={w}$, $x={x}$, $b={b}$, find $z$ as an integer.","mlLogisticProblemPromptMultiScore":"In the linear score $z = w_1 x_1 + w_2 x_2 + b$, when weights are {weights}, features are {features}, and $b={b}$, find $z$ as an integer.","mlLogisticProblemPromptClassifyFromZ":"When the linear score $z = {z}$, according to the decision boundary ($z>0 \\Rightarrow \\hat{y}=1$, $z \\le 0 \\Rightarrow \\hat{y}=0$), find the predicted class $\\hat{y}$. 
(0 or 1)","mlLogisticProblemPromptClassifyFromProb":"When probability $p = {p}$ and threshold $= {threshold}$, if $p \\ge$ threshold then $\\hat{y}=1$, otherwise $\\hat{y}=0$. Find the predicted class $\\hat{y}$. (0 or 1)","mlLogisticProblemPromptCountClassOne":"For the following linear scores, we classify as class 1 when $z>0$. Find the number of samples classified as class 1 as an integer. $z$ list: {zList}","mlLogisticProblemPromptCountMisclassified":"When the true labels are {labels} and the linear score $z$ for each sample is {zList}, we predict $\\hat{y}_i = 1$ if $z_i>0$ else $0$. Find the number of misclassified samples.","mlLogisticProblemSolvingTable":"$1a","mlDecisionTreeProblemPrompt":"Read the instruction below, find the answer, then enter it in the blank (?).","mlDecisionTreeProblemPromptCountNodes":"In a decision tree there are {internal} internal nodes and {leaves} leaf nodes. Find the total number of nodes.","mlDecisionTreeProblemPromptCountLeaves":"In a decision tree there are {leaves} leaf nodes. Find the number of leaf nodes.","mlDecisionTreeProblemPromptTreeDepth":"The maximum depth of the tree (root = 0) is {depth}. Find the depth.","mlDecisionTreeProblemPromptFollowPath":"In the decision tree, the path is {path} (0 = no/left, 1 = yes/right). Find the predicted class (0 or 1) at the leaf you reach.","mlDecisionTreeProblemPromptLeafMajority":"At one leaf, class 0 has {c0} samples and class 1 has {c1}. Find the predicted class (0 or 1) by majority vote.","mlDecisionTreeProblemPromptGini":"When class counts are {counts}, compute Gini impurity $G = 1 - \\sum_i p_i^2$ and find the rounded integer for $100 \\times G$.","mlDecisionTreeProblemPromptEntropy":"When class counts are {counts}, compute entropy $H = -\\sum_i p_i \\log_2 p_i$ and find the rounded integer for $100 \\times H$.","mlDecisionTreeProblemPromptInformationGain":"Parent node class counts {parentCounts}, left child {leftCounts}, right child {rightCounts}. 
Find the rounded integer for $100 \\times \\text{IG}$ (information gain).","mlDecisionTreeProblemPromptWeightedGini":"After split: left child class counts {leftCounts}, right child {rightCounts}. Find the rounded integer for $100 \\times$ weighted Gini $(n_L/n)G_L + (n_R/n)G_R$.","mlDecisionTreeProblemSolvingTable":"**Decision tree — solving guide**\n\n| Type | How to solve | Answer format |\n| :--- | :--- | :--- |\n| **Node count** | Internal nodes + leaf nodes. | Integer |\n| **Leaf count** | The number given as leaf count. | Integer |\n| **Depth** | Maximum depth (root = 0). | Integer |\n| **Follow path** | Start at root; 0 = left, 1 = right; the leaf’s prediction is the answer. | 0 or 1 |\n| **Gini** | Get $p_i$ from class counts, $G = 1 - \\sum_i p_i^2$, round $100 \\times G$. | Integer |\n| **Entropy** | $H = -\\sum_i p_i \\log_2 p_i$, round $100 \\times H$. | Integer |\n| **Weighted Gini** | $(n_L/n)G_L + (n_R/n)G_R$, round $100 \\times$. | Integer |\n| **Leaf majority** | Class 0: $a$, class 1: $b$; predict 0 if $a \\ge b$, else 1. | 0 or 1 |","mlEnsembleProblemPrompt":"Read the instruction below, find the answer, then enter it in the blank (?).","mlEnsembleProblemSolvingLabel":"Explanation for solving the problems","mlEnsembleProblemPromptMajorityVote":"In a random forest, class 0 received {votes0} votes and class 1 received {votes1} votes. Find the final predicted class (0 or 1) by majority vote.","mlEnsembleProblemPromptCountVotes":"There are {totalTrees} trees; class 0 has {votes0} votes and class 1 has {votes1} votes. Find the number of votes for the winning class.","mlEnsembleProblemPromptRegressionMean":"In a regression ensemble, {B} trees predicted {predictions}. Find the mean $\\hat{y} = \\frac{1}{B}\\sum_{b=1}^B \\hat{y}_b$, then round to an integer.","mlEnsembleProblemPromptNumTrees":"In a random forest there are {B} trees. 
Find the number of trees $B$.","mlEnsembleProblemPromptOobCount":"There are {nTrees} trees, and a sample was in the bootstrap of only {nInBag} of them. Find the number of trees that did not use this sample (OOB count).","mlEnsembleProblemPromptFormulaMean":"In an ensemble, {B} trees have predictions summing to {sum}. Find the mean $\\hat{y} = \\frac{1}{B}\\sum_{b=1}^B \\hat{y}_b$ and round to an integer.","mlEnsembleProblemPromptDefinition":"If the following statement is true enter 1, otherwise 0. {statement}","mlEnsembleProblemPromptFeatureImportance":"Feature importances are {importances}. Find the index (starting from 1) of the feature with the highest importance.","mlEnsembleProblemPromptWeightedVote":"There are 2 trees: the first gives class {c1} weight {w1}, the second gives class {c2} weight {w2}. Find the final prediction (0 or 1) by the class with the larger weight.","mlEnsembleStatement_0":"In bagging, each base model is trained independently.","mlEnsembleStatement_1":"Random forest is an ensemble that combines bagging and decision trees.","mlEnsembleStatement_2":"In classification ensembles, the final prediction is usually by majority vote.","mlEnsembleStatement_3":"In boosting, later models focus on samples that previous models got wrong.","mlEnsembleStatement_4":"OOB (Out-of-Bag) means predicting a sample using only trees that did not have it in their bootstrap sample.","mlEnsembleStatement_5":"In stacking, a meta-model uses the predictions of base models as input.","mlEnsembleStatement_6":"In regression ensembles, the final prediction is usually the average of tree predictions.","mlEnsembleStatement_7":"In random forest, at each split only a random subset of features is considered.","mlEnsembleStatement_8":"An ensemble combines predictions from multiple models into one prediction.","mlEnsembleStatement_9":"Random forest tends to reduce variance compared to a single decision tree.","mlEnsembleStatement_10":"In boosting, each base model is trained 
independently.","mlEnsembleStatement_11":"In regression ensembles, the final prediction is by majority vote.","mlEnsembleStatement_12":"OOB evaluation requires a separate validation set.","mlEnsembleStatement_13":"In random forest, each tree is trained on the full training data.","mlEnsembleStatement_14":"In stacking, the meta-model uses only the original input features of the base models.","mlEnsembleProblemSolvingTable":"**Ensemble problem-solving guide**\n\n| Type | How to solve | Answer format |\n| :--- | :--- | :--- |\n| **Majority vote** | Compare votes for class 0 vs class 1; the majority is the final prediction. If tied, use 0. | 0 or 1 |\n| **Vote count** | Number of votes for the winning class. | Integer |\n| **Regression mean** | Sum of predictions divided by number of trees; round if needed. | Integer |\n| **Number of trees** | The $B$ given in the problem. | Integer |\n| **OOB count** | Total trees minus trees that had this sample in their bootstrap. | Integer |\n| **Formula mean** | Sum ÷ $B$, then round. | Integer |\n| **Definition** | True → 1, false → 0. | 0 or 1 |\n| **Feature importance** | Index (from 1) of the feature with the largest importance. | Integer |\n| **Weighted vote** | The class with the larger weight is the final prediction. | 0 or 1 |","mathExponentialProblemPrompt":"Find the value of the exponential and fill in the blank (?).","mathExponentialProblemPromptExponent":"Find the exponent (?) and fill in the blank.","mathExponentialProblemPromptCompare":"Choose the larger one and enter 1 or 2.","mathExponentialProblemPromptProduct":"Same base product: find the exponent sum (?).","mathExponentialProblemPromptQuotient":"Same base quotient: find the exponent difference (?).","mathExponentialProblemPromptPowerOfPower":"Find the value of the power of a power.","mathLogProblemPrompt":"Find the value of the logarithm and fill in the blank (?).","mathLogProblemPromptInput":"Find the argument (?) 
and fill in the blank.","mathLogProblemPromptCompare":"Choose the larger value and enter 1 or 2.","mathLogProblemPromptSum":"Log sum: $\\log_a(b) + \\log_a(c) = \\log_a(b \\cdot c)$. Fill in the blank (?).","mathLogProblemPromptDiff":"Log difference: $\\log_a(b) - \\log_a(c) = \\log_a(b/c)$. Fill in the blank (?).","mathLimitProblemPrompt":"Find the limit and fill in (?). (Types: polynomial, constant, x→∞, ε-δ concept)","mathLimitProblemPromptDirect":"Find the limit (fill in ?).","mathLimitProblemPromptConstant":"Find the limit of the constant.","mathLimitProblemPromptLinear":"Find the limit of the linear expression.","mathLimitProblemPromptAtInfinity":"Find the limit as x → ∞.","mathLimitProblemPromptEpsilon":"Enter the number that matches the ε-δ definition.","mathLimitProblemEpsilonQuestion":"In ε-δ, what does δ represent?","mathLimitProblemEpsilonHint":"(1=distance, 2=error)","mathContinuityProblemPrompt":"Continuity: find the limit or whether the function is continuous. Fill in (?).","mathContinuityProblemPromptLimitPoly":"Polynomial is continuous, so limit = function value. Fill in (?).","mathContinuityProblemPromptLimitLinear":"Find the limit of the linear expression (equals the function value).","mathContinuityProblemPromptYesNo":"Enter 1 if continuous at that point, 0 if discontinuous.","mathContinuityProblemPromptLimitAtHole":"Find the limit at the point where there is a hole.","mathContinuityProblemAtPoint":" at ","mathContinuityProblemContinuousQ":" continuous?","mathContinuityProblemLimitAtHoleIntro":"A function with a hole at","mathContinuityProblemLimitAtHoleQ":"has limit = ?","mathDerivativeProblemPrompt":"Derivative: find the derivative (slope of the tangent) at the given point and fill in (?).","mathDerivativeProblemPromptPower":"Power rule: $(x^n)' = n x^{n-1}$. Find $f'(x)$ at the given point.","mathDerivativeProblemPromptLinear":"Linear: $(mx+b)' = m$. Find $f'(x)$ at the given point.","mathDerivativeProblemPromptPoly2":"Quadratic derivative. 
Find $f'(x)$ at the given point.","mathDerivativeProblemPromptConstMul":"Constant multiple: $(c \\cdot x^n)' = c \\cdot n \\cdot x^{n-1}$. Find $f'(x)$ at the given point.","mathDerivativeProblemAtPoint":" at","mathChainRuleProblemPrompt":"Chain rule: find $f'(x)$ at the given point and fill in (?). (Types: power, exponential, trig, sqrt, ln, quadratic)","mathPartialGradientProblemPrompt":"Partial derivative & gradient: find the partial derivative or gradient component at the given function and point, and fill in (?).","mlKnnProblemSolvingTable":"**Steps**\n\n| Step | Description |\n| :--- | :--- |\n| **Input** | New feature vector $\\mathbf{x}$ |\n| **Stored** | Labeled examples $(\\mathbf{x}_i, y_i)$ |\n| **1** | Compute distance $d(\\mathbf{x}, \\mathbf{x}_i)$ to each $\\mathbf{x}_i$ |\n| **2** | Select K smallest distances |\n| **3 (classification)** | Predict by **majority vote** of the K labels |\n| **3 (regression)** | Predict **average** of the K values |\n\n---\n\n**Example (distance squared)**\n\nTwo points A(0, 0) and B(3, 4) lie in the plane. 
Find the distance squared $(x_2-x_1)^2 + (y_2-y_1)^2$.\n\n**Solution**\n\n$(3-0)^2 + (4-0)^2 = 9 + 16 = 25$, so the answer is **25**.","mlLinearRegressionProblemSolvingTable":"$1b","mathIntegralProblemPrompt":"Integral: find the definite integral or antiderivative value and fill in (?).","mathIntegralProblemPromptDefiniteConst":"Find the definite integral of the constant function.","mathIntegralProblemPromptDefiniteLinear":"Find the definite integral of the linear function.","mathIntegralProblemPromptAntiderivative":"Find the value of the antiderivative at the given point.","mathRandomVariableProblemPrompt":"Follow the instruction below.","mathRandomVariableProblemPromptProbSumSix":"Find the blank c so the three probabilities sum to 1.","mathRandomVariableProblemPromptExpectedValueScale6":"Find 6×E[X] = Σ(value × numerator).","mathRandomVariableProblemPromptVarianceShort":"Find 36 times the variance for the distribution below.","mathRandomVariableProblemVarianceHowToCalc":"Variance = how spread out values are from the average. Variance = E[X²]−(E[X])², 36×variance = 6×Σ(nᵢ·xᵢ²) − (Σ nᵢ·xᵢ)²","mathRandomVariableProblemVarianceLabel":"36×variance","mathRandomVariableProblemPromptVarianceScale36":"For the same distribution, Var(X)=E[X²]-E[X]². Find 36×Var(X). (6×Σ(nᵢ·xᵢ²) − (Σ nᵢ·xᵢ)²)","mathRandomVariableProblemPromptVarianceIntro":"For the same distribution, ","mathRandomVariableProblemPromptVarianceMid":". Find ","mathRandomVariableProblemPromptVarianceEnd":". (6×Σ(nᵢ·xᵢ²) − (Σ nᵢ·xᵢ)²)","mathRandomVariableProblemPromptVarianceAsk":". ","mathRandomVariableProblemPromptVarianceFormula":"(6×Σ(nᵢ·xᵢ²) − (Σ nᵢ·xᵢ)²)","mathRandomVariableProblemProbSumHint":"c","mathRandomVariableProblemExpectationHint":"Sum of (value × numerator)","mathRandomVariableProblemVarianceHint":"36×Var(X)","mathRandomVariableProblemPromptMode":"Which value of X has the highest probability? 
(mode)","mathRandomVariableProblemPromptExpectedValueInt":"Find the expected value E[X] (average value).","mathRandomVariableProblemPromptCumulativeNumerator":"When the probability that X is at most the given value is written as ?/6, find ? (the numerator).","mathRandomVariableProblemModeLabel":"Most likely X","mathRandomVariableProblemExpectedValueIntLabel":"E[X]","mathRandomVariableProblemCumulativeLabel1":"P(X≤1) = ?/6 → ?","mathRandomVariableProblemCumulativeLabel2":"P(X≤2) = ?/6 → ?","mathMeanVarianceProblemPrompt":"Follow the instructions below.","mathMeanVarianceProblemPromptProbSumSix":"Find the blank c so that the three probabilities sum to 1.","mathMeanVarianceProblemPromptMeanScale6":"Find 6×E[X] = Σ(value×numerator).","mathMeanVarianceProblemPromptVarianceShort":"Find 36×variance for the following distribution.","mathMeanVarianceProblemVarianceHowToCalc":"Variance = spread around the mean. 36×variance = 6×Σ(nᵢ·xᵢ²) − (Σ nᵢ·xᵢ)²","mathMeanVarianceProblemVarianceLabel":"36×variance","mathMeanVarianceProblemPromptVarianceScale36":"Find 36×Var(X) for the same distribution.","mathMeanVarianceProblemProbSumHint":"c","mathMeanVarianceProblemMeanScale6Label":"6×mean","mathMeanVarianceProblemMeanIntegerLabel":"Mean E[X]","mathMeanVarianceProblemPromptMeanInteger":"Find the mean (expected value) E[X].","mathMeanVarianceProblemPromptMode":"Find the X value with the highest probability (mode).","mathMeanVarianceProblemPromptCumulativeNumerator":"When P(X≤given) is written as ?/6, find the numerator ?.","mathMeanVarianceProblemModeLabel":"Most likely X","mathMeanVarianceProblemCumulativeLabel1":"P(X≤1) = ?/6 → ?","mathMeanVarianceProblemCumulativeLabel2":"P(X≤2) = ?/6 → ?","mathUniformNormalProblemPrompt":"Follow the instructions below.","mathUniformNormalProblemPromptUniformMean":"For the uniform distribution on [a,b], find the mean (a+b)/2.","mathUniformNormalProblemPromptUniformVar12":"For uniform U[a,b], find 12×variance = 
(b−a)².","mathUniformNormalProblemPromptUniformLength":"Find the length of the interval [a,b], i.e. b−a.","mathUniformNormalProblemPromptNormalPct68":"In a normal distribution, about what percent of values lie within μ±σ? (Give an integer.)","mathUniformNormalProblemPromptNormalPct95":"In a normal distribution, about what percent of values lie within μ±2σ? (Give an integer.)","mathIntegralProblemAntiderivativeIntro":"Given that","mathIntegralProblemAntiderivativeAt":" what is the value at ","mathIntegralProblemAntiderivativeQ":"?","mathPartialGradientProblemAtPoint":"at","mathPartialGradientProblemGradientFirst":"First component","mathPartialGradientProblemGradientSecond":"Second component","wrongAnswerGuideButton":"Why was it wrong?","wrongAnswerGuideTitle":"Wrong answer guide","wrongAnswerGuideSubmittedAnswer":"Your answer:","wrongAnswerGuideHint":"The AI will infer why you solved it that way and guide you in the right direction without giving the answer.","wrongAnswerGuideApiQuestion":"The user got the problem wrong with the answer \"{answer}\". Infer why they might have solved it that way and guide them in the right direction without revealing the correct answer.","wrongAnswerGuideAsking":"Getting guide...","wrongAnswerQuestionPrompt":"I answered {answer}. Why was it wrong?","getSolution":"Get solution","loadingSolution":"Loading...","feedbackTitle":"AI grading feedback","solutionTitle":"Solution","alertDrawFirst":"Please draw your solution before grading.","alertInputFirst":"Please enter your solution before grading.","errorGrade":"An error occurred while grading. Please try again.","errorSolution":"An error occurred while loading the solution. 
Please try again.","errorGradeRequest":"Grading request failed","errorSolutionRequest":"Solution request failed","errorStream":"Could not read stream.","errorDefault":"Could not generate feedback.","placeholderChapter":"This chapter is coming soon.","conceptVisualPlaceholder":"A visualization for this concept is coming soon.","conceptComingSoon":"Learning content for this concept will be added in a future update.","conceptMatrixMulIntro":"One row of A · one column of B (dot product) → one entry of the result matrix","conceptMatrixMulCell":"This entry","conceptLinearLayerIntro":"Multiply input X by weight matrix W and add bias b to get output Y. __LINEAR_FORMULA__","conceptLinearLayerLegendRow0":"W row 1·X + b[0] → Y[0]","conceptLinearLayerLegendRow1":"W row 2·X + b[1] → Y[1]","conceptArtificialNeuronIntro":"An artificial neuron computes the weighted sum __WEIGHTED_SUM_FORMULA__ , then applies an activation (e.g. ReLU, Sigmoid, or Tanh) to produce output Y.","conceptArtificialNeuronCalcCaption":"Step by step: (W·X) multiplied + b added = Z → ReLU(Z) = Y","conceptBatchIntro":"A batch stacks multiple samples as columns of a matrix. The same W and b are applied at once: __LINEAR_FORMULA__ .","conceptBatchCaption":"One column = one sample. Same W and b applied to all columns at once.","conceptBatchExampleTitle":"Example: calculation for one column (sample)","conceptBatchFormulaRow":"Z{n} = (W row {row}·this column)+b[{bi}] = ({calc})+({b}) = {result}","conceptConnectionIntro":"Connections describe how neurons in one layer link to the next. 
Only non-zero weights are actual links; the graph below shows those partial connections.","conceptConnectionGraphCaption":"Connection structure (zero-weight links omitted)","conceptConnectionCalcCaption":"Each output: (W row·X) multiplied + b added = Y","conceptConnectionFormulaRow1":"Y₁ = (W row 1·X) + b₁ = ({calc}) + {b} = {wx} + {b} = {y}","conceptConnectionFormulaRow2":"Y₂ = (W row 2·X) + b₂ = ({calc}) + {b} = {wx} + {b} = {y}","conceptActivationTitleSigmoid":"Y = Sigmoid(X)","conceptActivationTitleRelu":"Y = ReLU(X)","conceptActivationTitleTanh":"Y = Tanh₃(X)","conceptActivationTableHeader":"X ~ Y","conceptDotProductIntro":"a = [{a1}, {a2}], b = [{b1}, {b2}] → a·b = {samePositionSum}","conceptDotProductSamePositionSum":"sum of element-wise products","problemPromptConnection":"In the connection __LINEAR_FORMULA__ , find the value for the blank (?). Inputs with W=0 are not connected to that output.","conceptHiddenIntro":"A hidden layer takes input, applies a linear transform (__LINEAR_CORE__) and ReLU to produce an intermediate representation H, then applies another linear transform and ReLU to produce the final output Y.","conceptHiddenGraphCaption":"Input → Hidden (H) → Output (Y)","problemPromptHidden":"In the forward pass with a hidden layer (X → (W·X+b) → ReLU → H → (W·H+b) → ReLU → Y), fill in the blank (?).","conceptDeepIntro":"A deep network stacks many hidden layers. Each layer applies Linear (W·input + b) and ReLU to produce an intermediate representation before the next layer.","conceptDeepFormulaCaption":"Each layer: Linear & ReLU","conceptDeepFormulaWithSymbols":"Linear = W·(prev layer) + b → ReLU","conceptDeepGraphCaption":"Input (X) → Hidden (A,B,C,D) → Output (Y)","problemPromptDeep":"In the multi-layer forward pass (each layer Linear & ReLU), fill in the blank (?).","conceptWideIntro":"Width means having many neurons in one layer. 
Wider layers can express more features at once; each layer is computed with Linear & ReLU.","conceptWideFormulaCaption":"Each layer: Linear & ReLU (layer gets wider)","conceptWideGraphCaption":"Input (X) → Hidden (A,B) → Output (Y) → 1→2→4→8 neurons","problemPromptWide":"In the forward pass where layers get wider (each layer Linear & ReLU), fill in the blank (?).","conceptSoftmaxIntro":"Softmax turns numbers into values between 0 and 1 that sum to 1. Compute __WEIGHTED_SUM_FORMULA__, then __SOFTMAX_EXP__, then divide each by the sum (__SOFTMAX_SUM__) to get probabilities.","conceptSoftmaxFormulaCaption":"Z = W·X + b → e^Z (e^x) → Y = e^Z / Σ","conceptSoftmaxGraphCaption":"Often used in the final layer for multi-class classification.","problemPromptSoftmax":"Compute __SOFTMAX_FLOW__ , then fill in the blank (?).","conceptSoftmaxEHint":"In this problem we use e = 3 for easy calculation. So __E_Z_3Z__. (e.g. Z=1 → 3, Z=2 → 9)","conceptGradientIntro":"The gradient is a vector that shows the direction and rate of change of a function. To reduce loss, we update parameters in the opposite direction. Forward: __GRADIENT_FORWARD__; backward: __GRADIENT_BACKWARD__.","conceptGradientForwardLabel":"Forward","conceptGradientBackwardLabel":"Backward","conceptGradientFormulaCaption":"Forward Z = W·X → Backward dZ = dW·X","conceptGradientGraphCaption":"The same idea applies to linear layers, hidden layers, and so on.","conceptGradientBlankHint":"In the problems, the blank (?) 
is **one entry of X** or **one entry of Z** (forward) / **dZ** (backward).","conceptGradientForwardDesc":"Forward: Z = W·X (each row of W dotted with X)","conceptGradientBackwardDesc":"Backward: dZ = dW·X (same structure, gradient values)","conceptInputX":"Input X","conceptLinear":"Linear","conceptLinearReLULayer1":"Linear & ReLU (layer 1)","conceptLinearReLULayer2":"Linear & ReLU (layer 2)","conceptSoftmaxFlowCaption":"Score (__Z__) → __3Z__ → divide by sum → probability (__Y__)","conceptSoftmaxZLabel":"Z (score)","conceptSoftmaxExpLabel":"3^Z","conceptSoftmaxSumLabel":"Sum","conceptSoftmaxProblemFlow":"Score (__Z__) → __3Z__ → divide by sum (__SIGMA__) → probability (__Y__)","conceptSoftmaxProbability":"Prob.","conceptSoftmaxExampleTitle":"Example: step-by-step calculation","conceptSoftmaxStepZ":"Z{n} = (W row {row}·X)+b[{bi}] = {calc}+{b} = {result}","conceptSoftmaxStepExp":"3^Z{n} = 3^{z} = {result}","conceptSoftmaxStepSum":"Σ = {items} = {result}","conceptSoftmaxStepY":"Y{n} = 3^Z{n}/Σ = {num}/{den} = {result}","conceptWideLinearReLU1":"Linear & ReLU (layer 1, width 2)","conceptWideLinearReLU2":"Linear & ReLU (layer 2, width 4)","conceptWideLayer1Formula":"Layer 1 (width 2): H = ReLU(W₁·X + b₁)","conceptWideLayer2Formula":"Layer 2 (width 4): Y = ReLU(W₂·H + b₂)","conceptMatrixMulCellDot":"Row {row} of A · column {col} of B (one dot product)","conceptMatrixMulARow":"Row {row} of A","conceptMatrixMulBCol":"Column {col} of B","conceptBatchLinear":"Multiply the table by weights and add bias to fill the blank.","conceptBatchLinearRelu":"Multiply by weights, add bias, then set negatives to 0 and fill the blank.","conceptBatchAdd":"Add the right-hand value to each row and fill the blank.","conceptBatchSubtract":"Subtract the right-hand value from each row and fill the blank.","conceptBatchMultiply":"Multiply each row by the right-hand value and fill the blank.","conceptBatchCenter":"Subtract each row's mean from that row and fill the blank.","conceptBatchSum":"Sum 
all numbers in each row and fill the blank.","conceptBatchMean":"Find the mean (integer) of each row and fill the blank.","conceptBatchRowMeanHint":"(row mean → 0)","conceptBatchRowSumHint":"(row sum)","conceptBatchRowMeanIntHint":"(row mean, integer)","conceptRowN":"row {n}","conceptDeepLayer1Title":"Layer 1: A₁, A₂, A₃ (W₁ each row·X + b₁)","conceptDeepLayer2Title":"Layer 2: B₁, B₂, B₃ (W₂ each row·A + b₂)","conceptDeepFormulaA":"A{n} = (W₁ {row}·X)+b₁[{bi}] = ({calc})+({b}) = {linear} → ReLU = {result}","conceptDeepFormulaAZero":"A{n} = (W₁ {row}·X)+b₁[{bi}] = ({calc})+({b}) = {linear} → ReLU(-1)=0 → {result}","conceptDeepFormulaB":"B{n} = (W₂ {row}·A)+b₂[{bi}] = ({calc})+({b}) = {linear} → ReLU = {result}","conceptHiddenLayer1Title":"Layer 1: H = ReLU(W₁·X + b₁)","conceptHiddenLayer2Title":"Layer 2: Y = ReLU(W₂·H + b₂)","conceptHiddenLinear1":"Linear₁","conceptHiddenLinear2":"Linear₂","conceptHiddenFormulaL1":"{linearLabel} = (W₁ {row}·X)+b₁[{bi}] = ({calc}) + ({b}) = {linear} → ReLU = {result}","conceptHiddenFormulaL2":"{linearLabel} = (W₂ {row}·H)+b₂[{bi}] = ({calc}) + ({b}) = {linear} → ReLU = {result}","conceptWideFormulaH1":"H₁ = (W₁ {row}·X)+b₁[0] = {calc} = {linear} → ReLU = {result}","conceptWideFormulaH2":"H₂ = (W₁ {row}·X)+b₁[1] = {calc} = {linear} → ReLU = {result}","conceptWideFormulaY1":"Y₁ = (W₂ {row}·H)+b₂[0] = {calc} = {linear} → ReLU = {result}","conceptWideFormulaY2":"Y₂ = (W₂ {row}·H)+b₂[1] = {calc} = {linear} → ReLU = {result}","conceptWideFormulaY3":"Y₃ = (W₂ {row}·H)+b₂[2] = {calc} = {linear} → ReLU = {result}","conceptWideFormulaY4":"Y₄ = (W₂ {row}·H)+b₂[3] = {calc} = {linear} → ReLU = {result}","conceptGradientZLine":"Z{n} = (W {row})·X = {calc} = {z}","conceptGradientDZLine":"dZ{n} = (dW {row})·X = {calc} = {dz}","problemPromptGradient":"Fill in the blank (?) in __GRADIENT_FORWARD__ or __GRADIENT_BACKWARD__ .","tinyNNTitle":"Deep learning diagram by chapter","tinyNNDescription":"As you complete each chapter, the diagram below fills in. 
This is the structure so far.","tinyNNComplete":"By the last chapter you'll see the full picture: forward → loss → backward → update.","tinyNNAriaLabel":"Deep learning diagram progress by chapter","mathDiagramTitle":"Math diagram by chapter","mathDiagramDescription":"Select a chapter to see its diagram below. View the flow of basic math at a glance.","midMathDiagramTitle":"Math diagram by chapter","midMathDiagramDescription":"Select a chapter to see its diagram below. View the flow of intermediate math at a glance.","mathDiagramComplete":"Through Ch01 Functions you see the full input → function → output structure.","mathDiagramAriaLabel":"Math diagram by chapter","mlDiagramTitle":"ML diagram by chapter","mlDiagramDescription":"Select a chapter to see its diagram below. View the machine learning flow at a glance.","mlDiagramAriaLabel":"ML diagram by chapter","introRoadmapHeading":"What you learn in Ch01–Ch12","mathIntroRoadmapIntro":"Understanding deep learning and machine learning requires basic math such as **functions**, **exponential and log**, **limits, derivatives, integrals**, and **probability and distributions**. Ch01–Ch12 cover exactly that. **Functions** are the basis of input→output; **derivatives and gradients** are what the model uses to decide **where and how much** to change parameters when learning; **probability and distributions** are needed for prediction and uncertainty.","premiumBadge":"Premium","premiumTitle":"This is a Premium Chapter","premiumDescription":"This chapter is exclusive to Learn paid subscribers. 
After subscribing, you get unlimited access to all Learn chapters: concept explanations, problem sets, and AI coaching.","premiumFeature1":"Unlock all Chapters 04–12","premiumFeature2":"Unlimited AI learning coach questions","premiumFeature3":"Early access to new chapters","premiumMonthly":"month","premiumCTA":"Subscribe to Premium","premiumComingSoon":"Coming soon","premiumLogin":"Already subscribed?","premiumLoginLink":"Log in","premiumLoginFirst":"Sign in to subscribe to Premium.","freeChaptersNote":"Chapters 01–03 are free to use."},"playground":{"title":"Mini neural network playground","configTitle":"Model settings","inputNodes":"Input nodes","hiddenNeurons":"Hidden layer neurons","activation":"Activation","createModel":"Create model","inputTarget":"Input and target","runForward":"Run forward","forwardSteps":"Forward steps","training":"Training","oneStep":"One step","epochs50":"50 epochs","weightsAndGradients":"Weights and gradients","linkFromProblem":"How this computation is used in the network","fromDotBanner":"Linked to the dot product exercise. The first neuron in the model below computes the dot product of input and weights. 
Run Forward to see.","inputXLabel":"Input X (comma-separated)","targetLabel":"Target (comma-separated)","trainingInProgress":"Training…","weightsW1":"W₁ (hidden layer weights)","weightsW2":"W₂ (output layer weights)","gradientsDW1":"dW₁ (gradient)","gradientsDW2":"dW₂ (gradient)","createModelHint":"Select settings above and click \"Create model\".","lossGraphEmpty":"Run training to see the loss per epoch.","lossGraphTitle":"Loss per epoch","epochLabel":"Epoch","lastLossLabel":"Last loss: {value} ({count} epochs total)"},"tinyNN":{"batchPhase0":"Samples 1, 2, 3 are separate.","batchPhase1":"When we merge them into one table → we compute with the same W, b at once.","batchPhase2":"The same W, b applies at once to every column (sample).","batchPhase3":"So output Y also comes out as one table at once.","batchInputSeparate":"Input (samples separate)","batchInputTable":"Input table X","batchSample1":"Sample 1","batchSample2":"Sample 2","batchSample3":"Sample 3","batchOneColOneSample":"One column = one sample","batchMergeHint":"Merging makes one table","batchSameWb":"Same W, b","batchComputeOnce":"Compute at once","batchResultY":"Output Y","batchResultCaption":"← Result from same W, b at once","batchFooter1":"Stacking samples into one matrix lets us compute with the same W, b at once.","batchFooter2":"So when we merge inputs into one table, output Y also comes out as one table at once.","batchFooter3":"One table goes through the same W, b. Only the input differs per column; the rule (W, b) is the same.","connDescription":"Each line between layers is a weight (w). Multiply input by weights, add them, then add bias (b) to get the next layer Y.","connWeightLabel":"weight(w)","connBiasLabel":"+bias(b)","connFooter":"Circles are values, lines are weights (w). Add bias (b) to the weighted sum to get the next layer Y.","hiddenDescription":"We only see input (X) and output (Y). 
The layer in between is used only inside the network, so it’s the hidden layer.","hiddenVisibleInput":"Visible: input","hiddenHiddenH":"Hidden: H","hiddenVisibleOutput":"Visible: output","hiddenBoxLabel":"Hidden layer (not visible from outside)","hiddenFooter":"Values flow input → hidden → output. The hidden layer is an internal representation we don’t see.","deepDescription":"Deep = many hidden layers (middle steps). The “deep” in deep learning is this depth.","deepLayerN":"Layer {n}","deepFooter":"More steps mean a deeper network. Deeper networks can learn more refined patterns.","wideWidthN":"Width {count}","wideNeuronsN":"{count} neurons","wideFooter":"The number of neurons in one layer is the width. Wider layers can handle more features at once.","softmaxScoreToProb":"Score → probability","softmaxExample":"(example: e ≈ 3)","softmaxScore":"Score","softmaxMid":"Mid","softmaxPowerOf3":"3 to the power","softmaxProb":"Probability","softmaxDivideBySum":"Divide by sum","softmaxRaise":"raised to","softmaxPowerLabel":"(3^{n})","activationDescription":"Representative activation functions where output Y changes nonlinearly with input X. (3-level quantized version)","activationSigmoid":"Sigmoid(X)","activationRelu":"ReLU(X)","activationTanh":"Tanh₃(X)","hiddenLayer1Formula":"W₁·X+b₁ → ReLU","hiddenLayer2Formula":"W₂·H+b₂ → ReLU","captionDotProduct":"Left X1,X2,X3 and right Y1,Y2,Y3 are connected by lines. Each right node is the dot product of the left with weights.","captionMatrixMul":"Left is one row of matrix A; right Y1–Y3 are dot products with columns of B. Together they form the matrix product A·B.","captionLinearLayer":"This block is a linear layer. Input is computed to the next layer at once as Y = W·X + b.","captionActivation":"Node values change in a nonlinear way through ReLU or σ. The last layer Y1, Y2, Y3 come from that.","captionArtificialNeuron":"Inside the dashed circle is one artificial neuron. 
Input (X) times weights plus bias (w·x+b), then ReLU, gives output (Y).","captionBatch":"One column = one sample. The same W, b is applied to all columns at once to compute Y = W·X + b.","captionConnection":"Lines between layers are weights (w). Values flow along these lines to the next layer.","captionHidden":"We only see input (X) and output (Y). The layer H in between is used only inside the network, so it’s the hidden layer. Data flows input → hidden → output.","captionDeep":"Deep means many hidden (middle) layers. More steps like X→A→B→C→…→Y mean deeper; deeper networks learn more refined patterns.","captionWide":"The number of neurons in one layer is the width. 1 neuron = 1 feature; 256 neurons = 256 features at once. Width can differ per layer (e.g. 1→2→4→8 or 256→128→64).","captionSoftmax":"Softmax divides so the last layer Y1,Y2,Y3 sum to 1. You can treat them as probabilities.","captionGradient":"Gradient (∇) flows from right to left, updating each layer a bit to reduce loss.","captionSummary":"Ch01–Ch12 in one network: forward, backward, weights, activation, gradient all in one picture.","labelWeightedSum":"Weighted sum","labelWeightBias":"Weight·input+bias","labelWeight":"Weight","labelProbSum":"(probability, sum=1)","labelResult":"Result","labelMatrixResult":"Matrix product result","labelNeuron":"Neuron"},"categories":{"math":{"title":"Foundations","navTitle":"Math"},"midMath":{"title":"Intermediate Math"},"advMath":{"title":"Advanced Math"},"dl":{"title":"Basic Deep Learning","navTitle":"Deep learning"},"midDl":{"title":"Intermediate Deep Learning"},"advDl":{"title":"Advanced Deep Learning"},"ml":{"title":"Basic Machine Learning","navTitle":"Machine learning"},"midMl":{"title":"Intermediate Machine Learning"},"advMl":{"title":"Advanced Machine Learning"},"comingSoon":"Coming soon","preparing":"(Coming soon)","completed":"completed"},"concepts":{"sectionLabels":{"whatIs":"What it is","whyImportant":"Why it matters in deep learning","howUsed":"How it is 
used","problemSolving":"Tips for solving the problems"},"intro":{"sectionTitle":"What is Deep Learning?","whatIs":["**Deep learning is like a smart calculator that learns by itself** — Instead of humans defining every rule one by one, it's a way for computers to find rules on their own by looking at huge amounts of data. Inspired by **neurons** in the brain exchanging signals, small computing units are stacked in **many layers**, which is why we call it **deep** learning.","**Deep learning is everywhere in our lives** — From conversational AI you use every day like **ChatGPT** and **Gemini**, to **self-driving cars** that read the road with cameras, to **Netflix and YouTube recommendation systems** that know your taste better than you do—they're all products of deep learning. The core idea is turning complex images and sounds into **numbers**, then adding and multiplying those numbers to find the right answer.","**You need the basics to build more powerful AI** — Beyond just using ready-made models, knowing the **basic math** that happens inside is important if you want to adapt and use models for your own goals. When you understand how numbers are grouped and computed, you can clearly see why an AI made a certain decision and tune it for better performance.","**What one layer in deep learning does** — Each layer multiplies the incoming numbers by **weights** (importance) and adds them, then passes the result to the next layer. As layers get deeper, the AI goes from dots and lines in the data to eyes, nose, mouth, and finally **high-level features** like dog vs. cat. The guide for adjusting those weights precisely toward the right answer is the **gradient**.","**This course's learning roadmap** — Deep learning is ultimately an efficient repetition of multiplication and addition. 
You'll learn the basics of how data moves through **Ch01 dot product** and **Ch02 matrix multiplication**, go through **Ch03–05 artificial neurons and activation functions**, and grasp **Ch06–10 the structure of deep and wide neural networks**. Finally, in **Ch11–12**, you'll conquer step by step the core idea of how AI learns by itself: the gradient.","Follow the **roadmap** below to see what each chapter aims for. If you follow along step by step, you'll gain the ability to interpret what kind of mathematical language state-of-the-art AI systems use internally."],"whyImportant":[],"howUsed":[],"problemSolving":[]},"dotProduct":{"sectionTitle":"Dot product in deep learning","whatIs":["The **dot product** multiplies **same-position elements** of two vectors and sums the results into a single number. For example, [2, 3] · [4, 1] = 2×4 + 3×1 = 11.","It also measures **how aligned** two vectors are: a large positive dot product means **similar direction**, zero means **perpendicular (unrelated)**, and negative means **opposite direction**. That's why it's great for measuring similarity.","In formula form: **a · b = a₁b₁ + a₂b₂ + … + aₙbₙ**. Both vectors must have the **same number of elements** for the dot product to work."],"whyImportant":["In deep learning, **one neuron's output is computed as a dot product** between its weights and the input. Multiply same-position values and sum them up—that gives the neuron's \"response score\" for that input.","The dot product is the **most fundamental operation** in deep learning because **matrix multiplication is just many dot products bundled together**. 
Linear layers, attention, embedding comparison—all rely on repeated dot products.","It also serves as a **similarity measure**: for example, Netflix computes the dot product of a user vector and a movie vector to get a \"match score.\" Dividing the dot product by the lengths of both vectors gives **cosine similarity**, which compares direction only."],"howUsed":["**Recommendation systems (Netflix, YouTube)**: Compute the dot product of a user vector and a content vector to get a \"how much this user would like this content\" score. Higher score = higher recommendation rank.","**Search engines & chatbots**: Convert queries and documents to vectors, then rank by dot product (similarity). ChatGPT uses the same principle when finding the most relevant information for your question.","**Attention mechanism**: In translators and chatbots, word vectors are dotted to compute \"relevance scores\"—the model focuses more on words with high scores."],"problemSolving":["**How to compute**: Multiply **same-position elements**, then add all the products. Example: [1, 2, 3] · [4, 5, 6] = 1×4 + 2×5 + 3×6 = 4 + 10 + 18 = 32.","**Finding a blank**: If the total dot product and the other products are given, sum the known products first, then subtract from the total to get the missing product. Divide by the known element to find the blank.","**Watch out**: Both vectors must have the **same number of elements**. Also, make sure to include **every** pair of elements—checking off each pair one by one helps avoid mistakes."],"paragraphs":["The **dot product** is the sum of **element-wise products** of two vectors: a·b = a₁b₁ + a₂b₂ + ….","In deep learning, one step of a linear transform is a **weight vector** dotted with an **input vector**, giving one **neuron**'s output. 
With many neurons, a **weight matrix** times the input (**matrix multiplication**) computes them at once; each entry is one dot product.","A larger dot product also means the two vectors are more **aligned**, so it is used for **attention**, **similarity**, and **embedding comparison**—measuring how similar two things are with a single number."]},"matrixMul":{"sectionTitle":"Matrix multiplication in deep learning","whatIs":["**Matrix multiplication** combines two number tables (matrices) into a new one. Take **one row** of the first matrix and **one column** of the second, compute their **dot product**, and that fills **one entry** in the result.","Repeat this for **every row-column combination** and the result matrix is complete. For example, a 2×3 matrix times a 3×2 matrix gives a 2×2 result.","The rule for it to work: the **number of columns** of the first matrix must equal the **number of rows** of the second. Remember this, and you can always tell whether two matrices can be multiplied."],"whyImportant":["A **linear layer** in deep learning multiplies the input by a weight matrix—that's matrix multiplication. If you have 10 neurons, you'd need 10 dot products; matrix multiplication does **all 10 at once**.","**GPUs** are specifically designed to do **thousands of matrix multiplications in parallel**. This is why millions of multiplications finish instantly, enabling real-time image recognition and chatbots.","**Nearly every operation** in deep learning boils down to matrix multiplication—attention, convolution, recurrent networks. Understanding matrix multiplication means understanding the backbone of deep learning."],"howUsed":["**Image recognition**: Pixel values are arranged in a matrix, multiplied by weight matrices to extract features like 'is there a dog or a cat?' 
This repeats across many layers.","**Chatbots & translators**: ChatGPT and Google Translate convert sentences into number matrices, then multiply by huge weight matrices dozens to hundreds of times to generate answers. Matrix multiplication accounts for most of the computation.","**Recommendations & self-driving**: Netflix computing recommendation scores for thousands of users at once, and a self-driving car recognizing obstacles from camera frames—both rely on large-scale matrix multiplication inside."],"problemSolving":["**Finding one entry**: Entry **(i, j)** of the result = dot product of **row i of A** and **column j of B**. Multiply same-position elements and sum.","**Blank strategy**: If the blank is in the result, just compute the dot product for that row and column. If the blank is in A or B, use the known result and other values to work backwards.","**Check dimensions**: Before multiplying, verify that A's **column count** equals B's **row count**. The result size is (A's rows) × (B's columns)."],"paragraphs":["**Matrix multiplication** fills each entry of the result by taking the **dot product** of **each row** of the first matrix and **each column** of the second.","A **linear layer** in deep learning multiplies the input by a **weight matrix** and adds a **bias**; that multiplication is **matrix multiplication**. (m neurons, n inputs → m×n matrix times n-dimensional input → m outputs.)","**GPUs** are optimized for massive **parallel** matrix multiplication, so most of deep learning is **matrix multiplication**."]},"linearLayer":{"sectionTitle":"Linear layer in deep learning","whatIs":["A **linear layer** multiplies the input by **weights (W)** and adds a **bias (b)** to produce output: **Y = W·X + b**. The W·X part is matrix multiplication, and b shifts the baseline up or down.","Think of it like a grading formula: 'math×0.3 + science×0.5 + English×0.2 + 10'. 
Here 0.3, 0.5, 0.2 are **weights (W)**, 10 is **bias (b)**, and the subject scores are **input (X)**.","A single linear layer decides **'how much to scale each input and how much to add.'** With multiple outputs, each output uses different weights and bias, computing many scores at once."],"whyImportant":["**Almost every deep learning model** uses linear layers as basic building blocks. ChatGPT, translators, and image classifiers all repeat 'W·X + b' hundreds to thousands of times. It's the **brick** of deep learning.","**Model size (parameter count)** is determined by 'how many inputs × how many outputs' for each linear layer. This size controls how complex a pattern the model can learn (**capacity**), balanced against the risk of **overfitting** (just memorizing training data).","However, stacking linear layers alone is equivalent to **one linear operation** (only straight lines). That's why an **activation function** (a bending function) is usually added after each linear layer to enable **curves and complex patterns**."],"howUsed":["**ChatGPT & translators**: Sentences are converted to number vectors, then passed through dozens to hundreds of linear layers, each computing W·X + b followed by an activation, to understand context and generate answers.","**Image recognition**: Feature vectors from photos are fed into linear layers to compute 'dog score,' 'cat score,' 'bird score' simultaneously. The final linear layer's outputs become per-class scores.","**Recommendation systems**: User info and product info are combined into a vector, fed through linear layers to get a 'how much this user would like this product' score. More layers allow finer recommendations."],"problemSolving":["**One formula**: Multiply input **X** by **weight matrix W** and add **bias b** to get output **Y**. So **Y = W·X + b**. Linear layer problems give you **X, W, b** and ask for **Y**, as in the purple box below.","**Numeric example**: With X = [2, 1], W = [[1,0],[1,1]], b = [1, -1], we get W·X = (2, 3). 
Adding bias b gives **Y = (2+1, 3-1) = [3, 2]**. The bias shifts each output up or down. Each entry of **Y** is the **dot product** of the corresponding **row of W** with **X**, plus the corresponding entry of **b**.","**Blank strategy**: If the blank is in **Y**, compute that row's W·X + b. If the blank is in **W** or **b**, use the known Y and X and rearrange the equation. Then **verify** by plugging back into Y = W·X + b."],"paragraphs":["A **linear layer** computes y = Wx + b: multiply input x by **weight matrix** W and add **bias** b.","Each output **neuron** is one **dot product** of its weight row with the full input. So **dot product** and **matrix multiplication** are the building blocks of linear layers.","Linear maps alone cannot express **nonlinear** functions well, so linear layers are usually followed by an **activation function**."]},"activation":{"sectionTitle":"Activation in deep learning","whatIs":["An **activation function** transforms a neuron's raw output (weighted sum) into a **specific range or shape**. The most common ones are **ReLU** (negative → 0, positive → unchanged), **Sigmoid** (compresses to 0–1), and **Tanh** (compresses to −1 to 1).","Think of it like a **faucet**: when water (signal) comes in, it either 'only lets through above a threshold (ReLU)' or 'reduces the flow if it's too strong (Sigmoid, Tanh).' This transformation makes the output suitable for the next layer.","**ReLU** is the most popular because it's simple to compute (keep if positive, zero if negative) and trains fast. **Sigmoid** is used when you need probability-like outputs, and **Tanh** when you want values centered around zero."],"whyImportant":["**No matter how many multiply-and-add (linear) operations you stack, the result is the same as one multiply-and-add.** Just as connecting straight lines only gives you a straight line, linear operations alone can **never represent curves or complex patterns**.","Activation functions add **bends (nonlinearity)**. 
These bends allow stacked layers to create **curves and complex boundaries**, enabling the model to learn patterns in images, speech, and text.","Without activation functions, no matter how deep the network, it can only do **what a single line could do**. Activations are the **essential ingredient** that makes deep learning 'deep.'"],"howUsed":["**Image recognition**: After computing W·X + b at each layer, **ReLU** clips irrelevant features (negatives to zero) and passes relevant ones (positives) to the next layer, progressively extracting 'eyes,' 'ears,' 'wheels,' etc.","**Chatbots & translators**: Hidden layers use **ReLU** or **GELU** (a smoother version) for nonlinearity; the final layer uses **Sigmoid** (yes/no decisions) or **Softmax** (choosing among multiple candidates) to produce the answer.","**Speech recognition & self-driving**: Sound waves or camera images are converted to numbers, then passed through many linear + activation layers to determine 'what word is this' or 'what object is that.' Without activation, such complex decisions would be impossible."],"problemSolving":["Find X's interval in the table; that gives Y.","Function | Rule","ReLU | 0 or less → 0; positive → same as X","Sigmoid | Small → 0, middle → 0.5, large → 1","Tanh₃ | Small → -1, middle → 0, large → 1","Note | Check the problem's table for boundaries."],"paragraphs":["An **activation function** makes a neuron's linear output (**weighted sum**) **nonlinear**. **ReLU**, **sigmoid**, and **tanh** are common.","Stacking only **linear layers** is equivalent to one big linear map. **Nonlinear** activations between layers are needed for **deep networks** to learn complex patterns.","Choosing where and which **activation** to use is a key **design decision** in deep learning."],"problemDiagramCaption":"Node values change in a squiggly way through ReLU or σ. The final layer Y1, Y2, Y3 come out that way.","solutionIntro":"For activation problems, Y is determined by which interval X falls into. 
Below is how to solve ReLU, Sigmoid, and Tanh₃ problems.","solutionRelu":"**ReLU**: X ≤ 0 → Y = 0, X > 0 → Y = X. If Y is blank, just check the sign of X.","solutionSigmoid":"**Sigmoid**: X < -1.5 → 0, -1.5~1.5 → 0.5, X > 1.5 → 1. Find X's interval from the table/graph and use the corresponding Y. Check the problem's table for boundaries.","solutionTanh":"**Tanh₃**: X ≤ -1 → -1, -1 < X < 1 → 0, X ≥ 1 → 1. Find X's interval from the table and fill Y (-1, 0, or 1). For boundary values, check which side the problem uses.","solutionCaption":"Interval boundaries may differ per problem; always check the table (or graph) given in the problem first."},"artificialNeuron":{"sectionTitle":"Artificial neuron in deep learning","whatIs":["An **artificial neuron** is the **smallest computational unit** of deep learning. It does exactly two steps: ① compute the **weighted sum** Z = W·X + b, ② apply an **activation function** Y = ReLU(Z) or Sigmoid(Z).","It's inspired by biological neurons: real neurons receive multiple signals, weight each one differently, sum them up, and fire if the total exceeds a threshold. The artificial neuron is a **mathematical simplification** of this process.","Summary: **Input (X)** → **Weight and bias (Z = W·X + b)** → **Activation (Y = f(Z))** → **Output (Y)**. That's everything an artificial neuron does."],"whyImportant":["AI models like ChatGPT, image classifiers, and recommendation systems are built by **connecting thousands to billions of these neurons**. Understand one neuron, and you can **read the entire model's behavior**.","**Training** means gradually adjusting each neuron's **weights (W) and bias (b)** so the output gets closer to the correct answer. 
Knowing how W and b affect the output is key to understanding learning.","A single neuron combines **dot product + bias + activation**, so it unifies everything from the previous chapters: **dot product, matrix multiplication, linear layer, and activation function** all come together here."],"howUsed":["**Real-life analogy—exam pass prediction**: Compute 'Math×0.4 + Science×0.4 + English×0.2 + 5 = 75' (weighted sum), then 'if ≥60 → pass (1), else fail (0)' (activation). That's exactly one neuron's operation.","**One neuron in image recognition**: It takes a specific region of pixels, computes weighted sum + bias, passes through ReLU to get a 'is there a horizontal line here?' score. Thousands of such neurons together can determine 'dog or cat.'","**Chatbots, translators, speech recognition**: Each part of a sentence or sound is converted to numbers, neurons score 'what patterns are present,' and those scores flow to the next layer's neurons to grasp increasingly complex meaning."],"problemSolving":["**Step 1—Weighted sum (Z)**: Compute Z = W·X + b. Dot product W's row with X, then add b. If the blank is in Z, fill it at this step.","**Step 2—Activation (Y)**: Apply the given activation to Z. **ReLU**: Y = Z if Z > 0, Y = 0 if Z ≤ 0. **Sigmoid**: check the table to see which interval Z falls in.","**Blank in W or b**: If Y and X are given, reverse the activation to find Z first, then solve Z = W·X + b for the blank. 
The key is to **work backwards one step at a time**."],"paragraphs":["An **artificial neuron** takes **weighted** inputs (**weighted sum**), then applies an **activation function** to produce one output.","The weighted sum is a **dot product** of the input and weight vectors; then a **nonlinear** activation is applied.","**Deep learning models** chain many such **neurons** to transform input to output in multiple stages."]},"batch":{"sectionTitle":"Batch in deep learning","whatIs":["A **batch** means **grouping multiple inputs (samples) into one table (matrix) and computing them all at once with the same weights**. Each **column = one sample** in the table.","Imagine a teacher grading tests **one by one** vs. feeding **30 tests into a grading machine** at once—the machine is much faster. Batching works the same way: the GPU processes many inputs **simultaneously**.","Key idea: the **same W (weights) and b (bias)** are applied to all samples. The only thing that differs per sample is the **input X**. That's why one matrix multiplication can compute results for many samples at once."],"whyImportant":["**Speed**: GPUs are optimized for processing **thousands of numbers simultaneously** rather than one at a time. Batching lets you use the GPU's full power, computing **tens to hundreds of times faster** than one-by-one.","**Training stability**: Updating weights based on just 1 sample is **noisy**. Using a **mini-batch** (e.g., 32 or 64 samples) averages the gradients for much more **stable** learning. Batch size is a critical training setting.","**Memory management**: With 1 million data points, you can't fit them all at once (GPU memory!). So you split into **mini-batches** (e.g., 64 at a time), process each batch, update weights, and repeat."],"howUsed":["**Netflix & YouTube recommendations**: Instead of computing for one user at a time, **thousands of users' data are batched** for simultaneous scoring. 
This enables real-time service.","**ChatGPT & translators**: When many users ask questions at the same time, their queries are **batched together** for one GPU pass. That's how millions of users get fast responses simultaneously.","**Image training**: When training on 100,000 images, they're split into mini-batches of 32, running 3,125 iterations. Each mini-batch computes Z = W·X + b, measures error (loss), and slightly adjusts weights."],"problemSolving":["**X has multiple columns**: Each column is one sample. Use the **same W and b** for each column. Find which row and column the blank is in, and use **only that column's numbers** to compute.","**Add/subtract/multiply/mean operations**: These apply to **same positions (same row, same column)**. For mean (e.g., zero-centering), compute the average **per column**. Use only that column's values for the blank.","**Verification tip**: Each column is independent—one column's result doesn't affect another. **Check each column separately** to catch mistakes easily."],"paragraphs":["**Batching** means grouping several **samples** into one **matrix** and computing with the same **weights** at once.","One **matrix operation** over many samples uses the **GPU** much better than processing one sample at a time.","Training usually computes **gradients** and **updates** weights per **mini-batch**."]},"connection":{"sectionTitle":"Connection in deep learning","whatIs":["A **connection** describes **how neurons in one layer link to neurons in the next layer**. Each connection has a **weight (number)** that determines 'how much this input affects this output.'","**Fully connected**: **Every** neuron in the previous layer connects to **every** neuron in the next. The linear layer (Y = W·X + b) we've learned is exactly a fully connected layer—every entry in W has a number.","**Partially connected**: Some entries in W are **zero**, meaning 'no connection.' That input has **no effect** on that output. 
CNNs, which connect only nearby pixels, are a classic example of partial connections."],"whyImportant":["**Connection structure defines the model's character.** Fully connected considers all inputs (more information but more parameters), while partial connections only look at what's needed (efficient and fast but may miss some information).","**AI training is the process of adjusting connection strengths (weights).** 'Make this connection stronger, that one weaker'—gradually adjusting to produce outputs closer to the correct answer. Large models have billions of such connections.","**Looking at where W is zero** reveals what the model ignores. After training, connections with near-zero weights indicate 'unimportant information.' This is used in **pruning** to make models lighter."],"howUsed":["**Image recognition (CNN)**: Uses **partial connections** where only nearby pixels connect. Distant pixels are less relevant, so this reduces parameters and is faster and more efficient.","**Chatbots & translators (Transformer)**: **Attention** determines 'which words relate to which other words'—it learns which connections to strengthen **dynamically** from the data.","**Recommendation & speech recognition**: The weights connecting user features to product features directly become recommendation scores. In speech recognition, the model learns how each sound frequency connects to the next layer's features."],"problemSolving":["**W = 0 means no connection**: For example, if W(2,1) = 0, the 1st input has **zero effect** on the 2nd output. You can **skip it** entirely in the calculation.","**Finding one output**: Find which inputs **are connected** (W ≠ 0) to that output, multiply W · X for those positions only, sum them, and add b. Zero entries multiply to zero, so skipping them gives the same result.","**Blank strategy**: First, **identify the zero entries in W**. Then set up equations using only the non-zero connections. 
If the blank is in W, use Y and X to reverse-calculate; if it's in Y, compute forward from W and X."],"paragraphs":["**Connection** is the structure that shows **how neurons** in one **layer** link to neurons in the next layer.","Networks are often described as **fully connected**, **partially connected**, or **recurrent**. In **fully connected** layers, every neuron in one layer links to every neuron in the next (e.g. a **Linear layer**). **Partially connected** means only some neurons link to the next layer (e.g. in CNNs, filters connect only some inputs). **Recurrent** connections feed a layer's output back into the same or an earlier layer at the next time step.","Each link has a **weight** that scales the signal. Entry (i,j) of matrix W is the strength from input j to output neuron i; these **weights** are learned.","In deep learning there can be millions to billions of connection weights. In Y = W·X + b, a zero in W means **no connection** from that input to that output (partial connection)."]},"hidden":{"sectionTitle":"Hidden layers in deep learning","whatIs":["A **hidden layer** is an **intermediate stage between input and output**. Users only see the input (e.g., a photo) and output (e.g., 'dog'), but in between, hidden layers create **'hidden features.'**","The flow is: **X → Linear(W·X+b) → ReLU → H (hidden representation) → Linear(W·H+b) → ReLU → Y (output)**. H is the hidden layer's result, containing compressed 'key features' of the input.","**Analogy**: When you see a photo and say 'dog,' your brain goes through 'colors → edges → eyes/nose/ears → dog!' These **intermediate thinking steps** are the hidden layers. The number of neurons (width) in the hidden layer determines how many different features it can capture."],"whyImportant":["Hidden layers **progressively summarize and transform** input data. 
**Early layers** capture simple features (brightness, edges), **later layers** capture complex features (eyes, wheels, letters).","**Without hidden layers**, the model maps input directly to output, only expressing very simple (linear) relationships. **With hidden layers**, it can learn complex relationships (curves, multi-condition combinations).","The **number of neurons (width)** and **number of layers (depth)** determine the model's **representational power**. Too small = information bottleneck and poor performance; too large = overfitting (memorizing instead of learning)."],"howUsed":["**Image recognition**: The stages 'pixels → edges → textures → object parts (eyes, wheels) → whole objects (dog, car)' are all hidden layers. Deeper layers extract more abstract features.","**Chatbots & translators**: After converting text to numbers, multiple hidden layers progressively refine 'word meaning → sentence context → answer direction.' ChatGPT passes through dozens of hidden layers (Transformer blocks) to generate responses.","**Speech recognition**: The transformation 'sound wave → frequency features → phonemes → words → sentences' goes through hidden layers at each stage."],"problemSolving":["**Compute in order**: X → (W·X+b) → ReLU → H → (W·H+b) → ReLU → Y. Compute each step **sequentially**. If the blank is in H, compute only through the first linear+ReLU. If in Y, compute H first then the second stage.","**ReLU caution**: When the linear result (W·input+b) is **negative, ReLU turns it to 0**. In the next layer, that value is 0, so that term **contributes nothing**—you can ignore it entirely. This is a frequent key point in hidden layer problems.","**Blank in W or b**: Hidden layer problems have **two stages** (two linear+activation). First identify which stage the blank is in. 
If you know the input and output of that stage, solve for the blank using that stage's equation alone."],"paragraphs":["**Hidden layers** sit between **input** and **output** layers and learn internal **representations** not directly observed.","They gradually transform input into **higher-level features**; **lower layers** capture simple patterns, **higher layers** more abstract ones.","The number of **neurons** and **layers** in hidden layers is a key factor in model **capacity**."]},"deep":{"sectionTitle":"Depth in deep learning","whatIs":["**Deep** means having **many hidden layers (intermediate stages)**. The **'deep'** in **deep learning** refers exactly to this depth! Each layer does linear (W·input+b) + activation (ReLU), then passes the result to the next layer.","**X → A → B → C → … → Y**—the more stages, the deeper. Analogy: with **1 stage** you can only 'draw a line,' with **10 stages** you can 'draw simple shapes,' and with **100 stages** you can 'draw a human face.' More depth = **more precise, complex patterns**.","But deeper isn't always better. Too many layers can cause **vanishing gradients** (learning signals don't reach early layers) or **overfitting** (memorizing training data instead of learning general patterns)."],"whyImportant":["**More layers enable more complex functions.** Each layer's activation adds 'bends,' and stacking layers **combines many bends** into very complex curves and decision boundaries.","In image recognition: **layers 1–2** learn 'lines, edges,' **layers 3–5** learn 'eyes, noses, wheels,' **layer 6+** learn 'dogs, cars.' This is possible because of **depth**.","Famous architectures like **ResNet** and **Transformer** can be **dozens to hundreds of layers** deep and still train well. The secret is **skip connections (residual connections)**: gradients can skip layers and flow directly to earlier layers. 
These techniques overcome the 'limits of depth.'"],"howUsed":["**ChatGPT**: GPT-4 consists of **dozens to hundreds** of Transformer blocks. Each block understands context more deeply, and the final layer generates the answer.","**Self-driving cars**: Camera images go through **deep networks** (e.g., ResNet-152, 152 layers!) to accurately distinguish obstacles, lane markings, and signs through many stages. Depth enables handling complex road situations.","**Speech recognition & translation**: Converting speech to text, or Korean to English, also goes through **deep networks** where each layer progressively captures 'phonemes → words → context → meaning.'"],"problemSolving":["**Example**: Input X = [3, 1, 2]. Layer 1: W₁·X+b₁ = [4, -1, 2] (linear), then ReLU gives A = [4, 0, 2]. Layer 2: W₂·A+b₂ = [2, 1, 5], ReLU gives B = [2, 1, 5]. If **A₂ is blank**?","**Solution**: The second entry of layer 1 linear output is -1, so ReLU(-1) = 0. So **A₂ = 0**. For a blank in a middle layer, compute that layer's **linear (W·input+b)** first, then apply **ReLU (negative → 0)**.","**In general**: Wherever the blank is, compute **all previous layers** in order to get that layer's input, then take the **dot product of the corresponding row of W with the input**, add the **bias entry**, and apply ReLU to get the answer."],"paragraphs":["**Deep** means having many **hidden layers**—many **layers** in the **network**. That is the 'deep' in **deep learning**.","More depth allows more stages of **nonlinear transformation** and more **complex functions**, but also **harder training**, **overfitting**, and **cost**.","Architectures like **ResNet** and **Transformer** help **train** very deep networks **stably** with **structural techniques**."]},"wide":{"sectionTitle":"Width in deep learning","whatIs":["**Width** refers to **how many neurons are in a single layer**. More neurons (wider) = the layer can **represent more features simultaneously**. 
For example, 1 neuron = 1 feature; 256 neurons = 256 features at once.","Analogy: if an **exam has 1 question**, you can only evaluate one skill; with **100 questions**, you can assess many abilities at once. Similarly, a wider layer **processes more diverse information** in one step.","Layers can have different widths. For example, '1 → 2 → 4 → 8' (widening) or '256 → 128 → 64' (narrowing) are both common designs, depending on the purpose."],"whyImportant":["**Depth (number of layers)** and **width (neurons per layer)** together determine the model's **total size (parameter count)**. With the same number of parameters, you can choose '**deep and narrow**' or '**shallow and wide**'—and this choice significantly affects performance.","Greater width means **more features processed simultaneously** per layer, but it also increases **computation and memory**. Too wide risks **overfitting** (memorizing training data).","In practice, blocks often change width in the middle. ResNet's **bottleneck** block narrows the middle (wide → narrow → wide) to save computation, while a Transformer's feed-forward block widens it (narrow → wide → narrow) so the **wide layer extracts key features** while the rest stays compressed."],"howUsed":["**Image recognition (CNN)**: The **channel count** (number of feature maps) at each layer is its width. Starting from 3 channels (RGB), deeper layers grow to 64 → 128 → 256 → 512 channels, extracting **increasingly diverse features**.","**Chatbots & translators (Transformer)**: The **hidden dimension** (e.g., 768, 1024, 4096) is the number of numbers each layer processes at once (its width). Large models like GPT-4 have dimensions in the thousands—very wide.","**Recommendation systems**: A 'user vector of 256 dimensions' means width 256. It holds 256 features (age, preferences, watch history, etc. transformed into numbers), enabling more detailed recommendations."],"problemSolving":["**Same formula per layer even when widening**: Linear (W·input+b) → ReLU. 
Find which layer and neuron the blank belongs to, then use **that layer's input** and **the corresponding row of W and entry of b** to compute.","**Watch W dimensions**: When width changes between layers, **W's size changes too**. W is (current width × previous width), so find the right **row** for the blank's neuron and dot it with the previous layer's output, plus b.","**Layer by layer**: Just like with depth problems, **compute previous layers' outputs first** before moving to the next. Don't forget ReLU (negative → 0) at each layer."],"paragraphs":["**Width** is the number of **neurons** (or **channels**) in one layer. A **wider layer** can represent more **features** at once.","How you balance **depth** (number of layers) and **width** (neurons per layer) affects **capacity** and **efficiency**. With the same **parameters**, you can go deeper or wider.","In practice, **width** often varies per layer to add **capacity** where needed."]},"softmax":{"sectionTitle":"Softmax in deep learning","whatIs":["**Softmax** is a function that **converts multiple scores (numbers) into probabilities**. All values become **between 0 and 1**, and they **sum to exactly 1**. So you can read them as probabilities.","The formula is __SOFTMAX_FORMULA__. Because it uses **powers of e (≈2.718)**, the largest score gets **amplified significantly** while others shrink relatively. The gap between 1st and 2nd place becomes more pronounced.","Example: scores [3, 1, 0] → e³≈20.1, e¹≈2.7, e⁰=1 → sum ≈ 23.8 → probabilities → [0.84, 0.11, 0.04]. The score of 3 was only 3× larger than 1, but the probability is over 7× larger!"],"whyImportant":["Softmax is used at the **final layer of almost every classification model**. 'This photo is 70% dog, 25% cat, 5% bird' lets you see **per-class probabilities** and **how confident** the model is.","When combined with **cross-entropy loss** during training, the gradients work out **cleanly and stably**. 
The model naturally learns to 'increase the correct class probability and decrease the rest.'","Softmax's property of 'all positive values that sum to 1' exactly matches the definition of a **probability distribution**. This makes it the **most natural way** to convert scores to probabilities, both statistically and theoretically."],"howUsed":["**Image classification**: The model's final layer outputs scores (logits) like [5.2, 2.1, 0.8, ...]. Softmax converts them to [0.70, 0.25, 0.05, ...]—**probabilities for each class**. The highest probability class is the final answer.","**Chatbots & translators**: When ChatGPT picks the next word, it scores every word in its vocabulary (tens of thousands!), converts to probabilities via softmax, and samples a word based on those probabilities. High-probability words appear often, but occasionally low-probability words are picked for variety.","**Attention mechanism**: In translators, relevance scores for 'which input words to focus on' are passed through softmax to become probabilities (weights). These weights create a **weighted average** of inputs that emphasizes the most relevant parts."],"problemSolving":["**Computation order**: ① Compute __WEIGHTED_SUM_FORMULA__ (logits) ② Compute __SOFTMAX_EXP__ (problem uses __E_APPROX_3__) ③ Compute __SOFTMAX_SUM__ (sum) = add all __SOFTMAX_EXP__ values ④ __SOFTMAX_Y_DIV__ (divide each by the sum). Follow this order.","**Finding blanks**: If Y is blank, compute 'that __SOFTMAX_EXP_DIV_SUM__.' If __SOFTMAX_EXP__ is blank, compute '__Y_TIMES_SUM__.' If Z is blank, reverse from __SOFTMAX_EXP__. If __SOFTMAX_SUM__ is blank, just add all __SOFTMAX_EXP__ values.","**Verification**: After computing, check that all Y values are **between 0 and 1** and **sum to 1**. If not, there's a calculation error. 
Also confirm whether the problem uses __E_APPROX_3__ or __E_APPROX_2718__."],"paragraphs":{"0":"**Softmax** maps a vector to values in **(0,1)** that **sum to 1**, so it can be interpreted as a **probability distribution**.","1":"In **classification**, applying softmax to the last layer gives **class** **probabilities** and is typically used with **cross-entropy loss**.","2":"The formula is __SOFTMAX_FORMULA__; the **exponent** **amplifies** the largest value."}},"gradient":{"sectionTitle":"Gradient in deep learning","whatIs":["The **gradient** tells you **'if you change a weight (parameter) slightly, how much and in which direction does the loss (error) change.'** Think of it as a **compass** pointing toward 'which way to go to reduce error.'","**Analogy**: Imagine walking down a mountain blindfolded. You feel the **slope (gradient) under your feet** and step toward the downhill direction. Walking **opposite to the gradient** leads you to the valley (minimum loss). This is **gradient descent**.","**Backpropagation** passes gradients **from the output back toward the input, one layer at a time**. Using the **chain rule** from calculus, it efficiently computes the gradient for every weight in every layer **in one pass**."],"whyImportant":["**AI training = looking at gradients and updating weights.** Without gradients, there's no way to know 'which direction to adjust,' making **learning impossible**. The gradient is the **heart** of deep learning training.","**Learning rate** controls 'how far to step each time.' Too large → overshoot the valley (diverge); too small → takes forever to arrive. Optimizers like **Adam** automatically **adjust the step size** based on gradient magnitude.","If gradients get **too large (gradient explosion)**, training becomes unstable; if they get **too small (gradient vanishing)**, early layers barely learn. 
Techniques like **gradient clipping**, **batch normalization**, and **skip connections** are used to prevent this."],"howUsed":["**Every trained AI model**: ChatGPT, image recognition, recommendation systems—**every model** computes gradients to update weights. Forward pass → compute loss → backward pass for gradients → update weights. Repeating these 4 steps millions of times is training.","**Forward and backward**: Forward computes Z = W·X going **forward**; backward propagates gradients dW, dX going **backward**. They always work as a pair.","**Fine-tuning**: When adapting ChatGPT for a specific use case, new data is used to compute gradients and slightly adjust weights. Thanks to gradients, a **pre-trained model** can quickly adapt to new purposes."],"problemSolving":["**Problem format**: The equation is either **forward Z = W·X** or **backward dZ = dW·X**. The blank (?) is **one entry of X** or **one entry of Z** (or **dZ**). W and dW are always fully given.","**Forward (Z = W·X)**: Each entry of Z = dot product of **one row of W** with **X**. If the blank is in **Z**, multiply that row of W by X and sum. If the blank is in **X**, use the other Z entries and rows of W to set up an equation and solve for that X entry.","**Backward (dZ = dW·X)**: **Same computation** as forward. Each entry of dZ = dot product of **one row of dW** with **X**. If the blank is in **dZ**, dot that row of dW with X. If the blank is in **X**, solve from the equation."],"paragraphs":["The **gradient** is the vector of **partial derivatives** of the **loss** with respect to each **parameter**—how much and in which **direction** the loss changes.","**Training** usually moves parameters in the **opposite direction** of the gradient (**gradient descent**). 
Gradients are computed efficiently by **backpropagation**.","**Learning rate**, **optimizer**, and **gradient clipping** are **key settings** for how gradients are used."]},"summary":{"sectionTitle":"Summary","whatIs":["The diagram below **collects everything from Ch01–Ch12** into **one network**: input X → hidden layers (A, B, C, D) → output Y, with **weights (W)**, **activation (ReLU, etc.)**, **batch**, and **gradient (∇)** shown.","Real training repeats **forward pass** (compute output) → **loss** → **backward pass** (gradients) → **update weights**. After this course you can follow that flow in the math."],"whyImportant":[],"howUsed":[],"problemSolving":[]}},"locale":{"ko":"Korean","ja":"Japanese","en":"English","zh":"Chinese (Simplified)"},"chapters":{"intro":{"chapter":"Chapter 00","title":"First steps in deep learning: How does AI think?","description":"Find out at a glance what deep learning is and what you'll learn in Ch01–Ch12."},"dotProduct":{"chapter":"Chapter 01","title":"Vector dot product: Finding similarity between data","description":"The most basic operation: multiplying two vectors' direction and magnitude into a single value."},"matrixMul":{"chapter":"Chapter 02","title":"Matrix multiplication: The magic of computing at once","description":"The product of two matrices is a new matrix filled with dot products of rows of the first and columns of the second."},"linearLayer":{"chapter":"Chapter 03","title":"Linear layer: Weights that decide importance","description":"Linear layer (or linear transformation layer). A layer that multiplies the input by a weight matrix and adds bias."},"activation":{"chapter":"Chapter 04","title":"Activation function: Adding judgment to AI","description":"Activation function. A function that makes a neuron's output nonlinear."},"artificialNeuron":{"chapter":"Chapter 05","title":"Artificial neuron: A unit that gathers information and sends signals","description":"Artificial neuron. 
A unit that takes input, computes a weighted sum, and applies an activation function."},"batch":{"chapter":"Chapter 06","title":"Batch processing: Learning together in one go","description":"Batch. A unit that processes multiple samples in one computation."},"connection":{"chapter":"Chapter 07","title":"Weight connections: The countless chains that build intelligence","description":"Connections. The weighted links between layers and between neurons."},"hidden":{"chapter":"Chapter 08","title":"Hidden layer: The invisible depth of thought","description":"Hidden. Layers between the input and output layers."},"deep":{"chapter":"Chapter 09","title":"Deep network: The power to solve more complex problems","description":"Depth. A network with many hidden layers is called a deep network."},"wide":{"chapter":"Chapter 10","title":"Width and neurons: Finding more features at once","description":"Width. A layer with many neurons is called a wide layer."},"softmax":{"chapter":"Chapter 11","title":"Softmax: Turning results into confidence","description":"Softmax (probability distribution). Transforms output so values are between 0 and 1 and sum to 1."},"gradient":{"chapter":"Chapter 12","title":"Gradient and backpropagation: Learning from mistakes","description":"Gradient. 
Tells which direction to move parameters to reduce loss."},"summary":{"chapter":"Chapter 13","title":"Summary: A map of AI at a glance","description":"You can see what you learned in Ch01–Ch12 in one neural network diagram."}},"midMathChapters":{"midMath00":{"chapter":"Chapter 00","title":"Intermediate Math and AI: Multivariable Space and Uncertainty"},"midMath01":{"chapter":"Chapter 01","title":"Vectors and Vector Space: Magnitude and Direction Beyond Scalars"},"midMath02":{"chapter":"Chapter 02","title":"Dot Product and Projection: Angle and Similarity Between Data"},"midMath03":{"chapter":"Chapter 03","title":"Matrices and Data: Structural Representation of Many Vectors"},"midMath04":{"chapter":"Chapter 04","title":"Matrix Multiplication and Linear Transformation: Math That Manipulates Space"},"midMath05":{"chapter":"Chapter 05","title":"Inverse and Determinant: Inverse of Transformation and Change in Volume"},"midMath06":{"chapter":"Chapter 06","title":"Linear Independence and Rank: Redundancy and Effective Dimension"},"midMath07":{"chapter":"Chapter 07","title":"Eigenvalues and Eigenvectors: Principal Axes Unchanged by Transformation"},"midMath08":{"chapter":"Chapter 08","title":"Directional Derivative and Gradient: Steepest Ascent in Multidimensional Space"},"midMath09":{"chapter":"Chapter 09","title":"Jacobian Matrix: First Derivatives of Multivariable Vector Functions"},"midMath10":{"chapter":"Chapter 10","title":"Hessian Matrix: Second Derivatives and Curvature of Surfaces"},"midMath11":{"chapter":"Chapter 11","title":"Taylor Series: Approximating Complex Functions with Polynomials"},"midMath12":{"chapter":"Chapter 12","title":"Convex Optimization: Conditions for Finding the Minimum"},"midMath13":{"chapter":"Chapter 13","title":"Conditional Probability and Dependence: Probabilistic Relations Between Variables"},"midMath14":{"chapter":"Chapter 14","title":"Bayes' Theorem: Updating Probability with Observed Data"},"midMath15":{"chapter":"Chapter 
15","title":"Covariance and Correlation: Measuring Linear Association Between Two Variables"},"midMath16":{"chapter":"Chapter 16","title":"Multivariate Normal Distribution: Joint Probability Model for Many Variables"},"midMath17":{"chapter":"Chapter 17","title":"Maximum Likelihood Estimation (MLE): Inferring Parameters from Observations"},"midMath18":{"chapter":"Chapter 18","title":"Entropy: Quantifying Uncertainty via Information Theory"},"midMath19":{"chapter":"Chapter 19","title":"Cross-Entropy and KL Divergence: Measuring Difference Between Two Distributions"},"midMath20":{"chapter":"Chapter 20","title":"Intermediate Math Summary: Linear Algebra and Probability Combined"}},"midMathCh10":{"chapter":"Chapter 10","title":"Hessian Matrix: Reading the Curvature of Surfaces","description":"The Hessian matrix is a square matrix of second-order partial derivatives of a scalar function. It encodes how much a surface curves at a point and is used to classify minima, maxima, and saddle points in optimization, and forms the basis of Newton's method and trust-region methods.","sectionTitle":"Hessian Matrix: Reading the Curvature of Surfaces","sectionLabels":{"whatIs":"What it is","whyImportant":"Why it matters","howUsed":"How it's used","problemSolving":"Problem-solving guide"},"whatIs":{"intro":"**What is the Hessian matrix?** — Think of it as a table of numbers that describe how much the surface curves in every direction at the point where you stand. It is a square matrix built from second derivatives of the function, and it is symmetric (same on both sides of the diagonal).","plain":"Imagine walking downhill with your eyes closed. What you feel under your feet—\"this way is steeper down\"—is the first derivative (gradient). The sense of \"if I take one more step, will the ground bowl down or stay flat?\" is the second derivative, i.e. the Hessian. 
With it you can avoid cliffs and find the true bottom, like the bottom of a bowl.","definition":"More precisely, the Hessian $\\mathbf{H}$ is the table whose $(i,j)$ entry is $H_{ij} = \\frac{\\partial^2 f}{\\partial x_i \\partial x_j}$—the function $f$ differentiated twice, once in each of the $x_i$ and $x_j$ directions. The **eigenvalues** of this matrix are what matter: all positive → **local minimum** (bowl), all negative → **local maximum** (dome), mixed signs → **saddle point** (up in one direction, down in another).","inAI":"In machine learning, training is about finding the \"valley\" where the error is smallest. Moving only by gradient is slow. Using the Hessian to read curvature lets you take **Newton-style** jumps toward the bottom and learn much faster."},"whyImportant":{"fakeBottom":"On the way down you may hit a flat spot where the gradient is zero. That does not mean you have reached the true bottom—it could be a saddle (flat in one place but up one way and down another). The **eigenvalues** of the Hessian tell you whether it is a true minimum or a saddle. When there are many variables (as in AI), avoiding these fake bottoms is crucial.","smartStep":"You want small steps on narrow paths and larger steps on open ground. The Hessian tells you \"how sharply the surface curves in each direction,\" so you can set step size (learning rate) well and descend efficiently without wasted moves."},"howUsed":{"newton":"Newton's method moves a lot in one step with: $\\mathbf{x}_{k+1} = \\mathbf{x}_k - \\mathbf{H}^{-1} \\nabla f(\\mathbf{x}_k)$. Here $\\mathbf{x}_k$ is the current point, $\\nabla f(\\mathbf{x}_k)$ is the gradient there, $\\mathbf{H}$ is the Hessian at that point, and $\\mathbf{H}^{-1}$ is its inverse. So you look at both the gradient and the curvature (Hessian) and jump toward the bottom to $\\mathbf{x}_{k+1}$. That can reach the answer much faster than small gradient-only steps.","quasiNewton":"When there are many variables, computing the Hessian exactly is costly. 
In practice, **quasi-Newton** methods (e.g. BFGS) approximate the Hessian from past gradient information instead of computing it fully, and are used more often."},"summary":"The Hessian is a symmetric matrix of second partial derivatives of a scalar function and encodes curvature and the nature of critical points. At a point where the gradient is zero, all positive eigenvalues imply a local minimum, all negative a local maximum, and mixed signs a saddle point. In machine learning it underlies second-order optimization such as Newton's method, trust-region, and quasi-Newton methods.","problemSolving":{"focus":"The table below lists only **formulas and symbol meanings** needed for problem-solving. See the **worked examples** under the table for step-by-step solutions.","examplesHeading":"Worked examples","examplesTable":"$1c"},"problemSolvingLabel":"Problem-solving guide","problemSolvingTable":"$1d","problemSolvingExample1":"**Example (entry count)**\n\nFor $f(x_1,x_2)$, the Hessian is $2\\times2$, so 4 entries; 3 independent. → **Answer 4** (total) or **3** (independent, by context)","problemSolvingExample2":"**Example (extrema)**\n\nIf eigenvalues are 2 and 5 (both positive), the point is a local minimum. → **Answer 1** (minimum) or the number asked","problemSolvingExample3":"**Example (Newton step)**\n\n$f(x)=x^2$ gives $f'(x)=2x$, $f''(x)=2$. At $x_0=4$, $x_1 = x_0 - f'(x_0)/f''(x_0) = 4 - 8/2 = 0$. 
→ **Answer 0**","visualShort":"Hessian: second partial derivatives → curvature and extrema","visualIntroShort":"The first derivative tells you \"which way is downhill\"; the second (Hessian) tells you \"will the surface bowl down, or go up in one direction and down in another (saddle point)?\" Follow the animation below.","visualWhyHessian":"The Hessian is the matrix of **second derivatives**, so the \"curvature\" in the figure below is exactly what the Hessian describes.","visualIntro":"The Hessian is the matrix of second partial derivatives of $f$ at $\\mathbf{x}$ and is used to read curvature and to classify minima, maxima, and saddle points.","visualConceptTitle":"Concept structure","visualConceptStep0":"Input: scalar function $f(\\mathbf{x})$, point $\\mathbf{x}$","visualConceptStep1":"Compute $\\frac{\\partial^2 f}{\\partial x_i \\partial x_j}$","visualConceptStep2":"Form Hessian $\\mathbf{H}$ (symmetric)","visualConceptStep3":"Eigenvalues → minimum (all +), maximum (all −), saddle (mixed)","visualFlowTitle":"Learning flow","visualFlowStep0":"Concept: second-derivative matrix","visualFlowStep1":"Intuition: curvature of the surface","visualFlowStep2":"Math: $H_{ij}$, symmetry, eigenvalues","visualFlowStep3":"Use: Newton, extrema, trust region","visualCaption":"Left: bowl (only curves down) → minimum. Inverted bowl (only curves up) → maximum. 
Saddle: one direction up, the other down → neither min nor max.","visualStep1":"Input: scalar function $f(\\mathbf{x})$, point $\\mathbf{x}$","visualStep2":"Compute 2nd partials $\\frac{\\partial^2 f}{\\partial x_i \\partial x_j}$","visualStep3":"Form Hessian matrix $\\mathbf{H}$ (symmetric)","visualStepsLabel":"Order to read","visualBowlTitle":"Bowl: curves only down → minimum","visualSaddleTitle":"Saddle: value ↑ this way, value ↓ that way","visualCurveDown":"↓ curvature","visualFppMin":"f″=2 > 0 → min","visualMinPoint":"Minimum","visualValueUp":"value↑","visualValueDown":"value↓","visualSaddleOrangeGreen":"Orange direction: value goes up · Green direction: value goes down","visualSaddleNeither":"Saddle: neither minimum nor maximum","visualSummary1":"Bowl curves only down → here is the minimum","visualSummary2":"Inverted bowl curves only up → here is the maximum","visualSummary3":"Saddle: one direction up, the other down → neither min nor max","problemPromptIntro":"Read the instructions below, find the answer (integer), and enter it in the blank (?).","promptDefinition":"If the following statement is true enter 1, if false enter 0.\n\n(Statement about Hessian / second derivative / extrema)","promptDefinitionChoice":"Choose the option that matches the question. Enter one number (1, 2, 3) for ①minimum ②maximum ③saddle.\n\n(Hessian eigenvalue / definition question)","promptElementCount":"When $f(\\mathbf{x})$ is a function of {n} variables, how many entries does the Hessian have? (integer)","promptIndependentCount":"For a symmetric Hessian with $n={n}$ variables, how many independent entries? (integer)","promptMatrixSize":"For a function of $n={n}$ variables, how many rows (or columns) does the Hessian have? (integer)","promptEigenvalueType":"When the Hessian eigenvalues are $\\lambda_1={ev1}$, $\\lambda_2={ev2}$, what is this point? ①min ②max ③saddle. 
Enter the number (1, 2, 3).","promptNewton1D":"For $f(x)={a}x^2{bVal}x+{c}$ with $x_0={x0}$, what is $x_1$ after one Newton step? (integer)","promptScalarSecondDeriv":"For $f(x)={a}x^2+bx+c$, what is the value of the second derivative $f''(x)$? (integer)","promptDefault":"Enter the answer (integer)."},"advMathChapters":{"advMath00":{"chapter":"Chapter 00","title":"Advanced Math and AI: Generative Theory and Complex-System Modeling","description":"Advanced math for AI: multidimensional analysis, complex distributions, and deep learning. Curriculum for generative models and reinforcement learning."},"advMath01":{"chapter":"Chapter 01","title":"SVD and Pseudoinverse: Extracting Latent Patterns from Data","description":"SVD and pseudoinverse for latent patterns. PCA, recommendation systems. Advanced math Ch.01."},"advMath02":{"chapter":"Chapter 02","title":"Tensor Algebra and Einstein Notation","description":"Tensor algebra, Einsum, contraction. Neural network and attention notation. Advanced math Ch.02."},"advMath03":{"chapter":"Chapter 03","title":"Lagrange Multipliers and KKT: Constrained Optimization","description":"Lagrange multipliers and KKT for constrained optimization. SVM and constrained RL. Advanced math Ch.03."},"advMath04":{"chapter":"Chapter 04","title":"Markov Chain: State Transitions and Stochastic Processes","description":"Markov chains, transition matrix, stationarity. MCMC and RL basics. Advanced math Ch.04."},"advMath05":{"chapter":"Chapter 05","title":"Monte Carlo Integration: Numerical Approximation","description":"Monte Carlo integration for high-dimensional expectations. Used in RL and Bayesian inference. Advanced math Ch.05."},"advMath06":{"chapter":"Chapter 06","title":"MCMC: Sampling from Complex Probability Distributions","description":"MCMC, Gibbs and Metropolis-Hastings. Sampling from complex posteriors. 
Advanced math Ch.06."},"advMath07":{"chapter":"Chapter 07","title":"EM Algorithm: Inference with Latent Variables","description":"EM algorithm: E-step, M-step, latent variable models. GMM, HMM. Advanced math Ch.07."},"advMath08":{"chapter":"Chapter 08","title":"MAP Estimation: Bayesian Optimization and Regularization","description":"MAP estimation, priors, L1/L2 regularization. Bayesian deep learning. Advanced math Ch.08."},"advMath09":{"chapter":"Chapter 09","title":"Conjugate Prior: Analytical Bayesian Inference","description":"Conjugate priors for tractable posteriors. Beta, Dirichlet. Advanced math Ch.09."},"advMath10":{"chapter":"Chapter 10","title":"JS Divergence and Mutual Information","description":"JS divergence and mutual information. GANs and information theory. Advanced math Ch.10."},"advMath11":{"chapter":"Chapter 11","title":"Variational Inference: Approximating Intractable Probabilities","description":"Variational inference, KL minimization, approximate posteriors. Core of VAE. Advanced math Ch.11."},"advMath12":{"chapter":"Chapter 12","title":"Reparameterization Trick: Differentiating Randomness","description":"Reparameterization trick for differentiable sampling. VAE training. Advanced math Ch.12."},"advMath13":{"chapter":"Chapter 13","title":"Optimal Transport and Wasserstein Distance","description":"Wasserstein distance, Earth Mover. WGAN when supports do not overlap. Advanced math Ch.13."},"advMath14":{"chapter":"Chapter 14","title":"MDP and Bellman Equation: Mathematical Basis of Reinforcement Learning","description":"MDP and Bellman equation. States, actions, rewards, value functions. RL math. Advanced math Ch.14."},"advMath15":{"chapter":"Chapter 15","title":"Fourier Transform and Spectral Analysis","description":"Fourier transform and frequency domain. Time series, images, CNN. 
Advanced math Ch.15."},"advMath16":{"chapter":"Chapter 16","title":"Graph Laplacian: Mathematizing Network Structure","description":"Graph Laplacian, adjacency, degree. GNN, smoothing. Advanced math Ch.16."},"advMath17":{"chapter":"Chapter 17","title":"SDE Basics: Continuous Injection of Noise","description":"SDE and Brownian motion. Diffusion forward process. Advanced math Ch.17."},"advMath18":{"chapter":"Chapter 18","title":"Langevin Dynamics and Score Matching","description":"Langevin dynamics and score matching. Diffusion reverse process. Advanced math Ch.18."},"advMath19":{"chapter":"Chapter 19","title":"Information Geometry and Natural Gradient","description":"Information geometry, Fisher matrix, natural gradient. Optimization on manifolds. Advanced math Ch.19."},"advMath20":{"chapter":"Chapter 20","title":"Advanced Math Summary: Generative Models and Deep Optimization","description":"How SDE, VI, optimal transport, and information geometry appear in VAE, GAN, Diffusion, LLM. Advanced math Ch.20."}},"midDlChapters":{"midDl00":{"chapter":"Chapter 00","title":"Intermediate DL: Stable Training and Unstructured Data"},"midDl01":{"chapter":"Chapter 01","title":"Weight Initialization"},"midDl02":{"chapter":"Chapter 02","title":"Optimization: Momentum and Adaptive Learning Rate"},"midDl03":{"chapter":"Chapter 03","title":"Learning Rate Scheduling"},"midDl04":{"chapter":"Chapter 04","title":"Loss Functions: Class Imbalance and Metric Learning"},"midDl05":{"chapter":"Chapter 05","title":"Regularization and Overfitting Prevention"},"midDl06":{"chapter":"Chapter 06","title":"Batch & Layer Normalization"},"midDl07":{"chapter":"Chapter 07","title":"Data Augmentation and Noise Robustness"},"midDl08":{"chapter":"Chapter 08","title":"CNN Basics: Spatial Feature Extraction"},"midDl09":{"chapter":"Chapter 09","title":"Pooling and Multi-Channel"},"midDl10":{"chapter":"Chapter 10","title":"Skip Connection and ResNet"},"midDl11":{"chapter":"Chapter 11","title":"Efficient 
Convolution: MobileNet"},"midDl12":{"chapter":"Chapter 12","title":"Vision Transfer Learning"},"midDl13":{"chapter":"Chapter 13","title":"Object Detection (YOLO, SSD)"},"midDl14":{"chapter":"Chapter 14","title":"Image Segmentation (U-Net)"},"midDl15":{"chapter":"Chapter 15","title":"NLP Preprocessing and Tokenization"},"midDl16":{"chapter":"Chapter 16","title":"Word Embedding (Word2Vec, GloVe)"},"midDl17":{"chapter":"Chapter 17","title":"1D CNN for Sequence Processing"},"midDl18":{"chapter":"Chapter 18","title":"RNN: Sequential State"},"midDl19":{"chapter":"Chapter 19","title":"LSTM and GRU: Long-Range Dependencies"},"midDl20":{"chapter":"Chapter 20","title":"Encoder-Decoder and Attention"},"midDl21":{"chapter":"Chapter 21","title":"Intermediate DL Summary"}},"midMlChapters":{"midMl00":{"chapter":"Chapter 00","title":"Intermediate ML: Real-World Data Limits and Model Optimization"},"midMl01":{"chapter":"Chapter 01","title":"Data Scaling and Distribution Transformation"},"midMl02":{"chapter":"Chapter 02","title":"Categorical Encoding"},"midMl03":{"chapter":"Chapter 03","title":"Missing Data and Imputation"},"midMl04":{"chapter":"Chapter 04","title":"Imbalanced Data Basics"},"midMl05":{"chapter":"Chapter 05","title":"Advanced Cross Validation"},"midMl06":{"chapter":"Chapter 06","title":"Multiclass Evaluation and ROC-AUC"},"midMl07":{"chapter":"Chapter 07","title":"SVM Basics: Decision Boundary and Margin"},"midMl08":{"chapter":"Chapter 08","title":"Kernel Trick: Nonlinear SVM"},"midMl09":{"chapter":"Chapter 09","title":"Dimensionality Reduction 1: PCA"},"midMl10":{"chapter":"Chapter 10","title":"Ensemble: Bagging and Pasting"},"midMl11":{"chapter":"Chapter 11","title":"Boosting Basics: AdaBoost"},"midMl12":{"chapter":"Chapter 12","title":"Gradient Boosting Machine (GBM)"},"midMl13":{"chapter":"Chapter 13","title":"Density-Based Clustering (DBSCAN)"},"midMl14":{"chapter":"Chapter 14","title":"Hierarchical Clustering and Dendrogram"},"midMl15":{"chapter":"Chapter 
15","title":"Gaussian Mixture Model (GMM)"},"midMl16":{"chapter":"Chapter 16","title":"Anomaly Detection Basics"},"midMl17":{"chapter":"Chapter 17","title":"Pipeline: Modeling Automation"},"midMl18":{"chapter":"Chapter 18","title":"Hyperparameter Tuning 1: Grid and Random Search"},"midMl19":{"chapter":"Chapter 19","title":"Hyperparameter Tuning 2: Bayesian Optimization (Optuna)"},"midMl20":{"chapter":"Chapter 20","title":"Intermediate ML Summary"}},"advMlChapters":{"advMl00":{"chapter":"Chapter 00","title":"Advanced ML: SOTA Models and Interpretability","description":"Principles of optimized boosting ensembles used in Kaggle and the importance of XAI for interpreting black-box predictions."},"advMl01":{"chapter":"Chapter 01","title":"XGBoost Algorithm","description":"Algorithm that improves on GBM speed and adds regularization to control tree complexity and prevent overfitting."},"advMl02":{"chapter":"Chapter 02","title":"LightGBM Algorithm","description":"Leaf-wise growth for speed and accuracy; contrast with level-wise tree building."},"advMl03":{"chapter":"Chapter 03","title":"CatBoost: Categorical Boosting","description":"Ordered Boosting to avoid target leakage; strong on tabular data with many categories."},"advMl04":{"chapter":"Chapter 04","title":"t-SNE for Manifold Visualization","description":"Nonlinear dimensionality reduction preserving local structure for 2D/3D visualization."},"advMl05":{"chapter":"Chapter 05","title":"UMAP: Topological Geometry","description":"Fast manifold learning preserving local and global structure; alternative to t-SNE."},"advMl06":{"chapter":"Chapter 06","title":"Isolation Forest","description":"Unsupervised anomaly detection using random splits; anomalies need fewer splits to isolate."},"advMl07":{"chapter":"Chapter 07","title":"One-Class SVM","description":"Kernel-based method learning a boundary around normal data; points outside are anomalies."},"advMl08":{"chapter":"Chapter 08","title":"Feature Selection and 
Importance","description":"Permutation importance and other ways to identify key variables."},"advMl09":{"chapter":"Chapter 09","title":"XAI 1: Partial Dependence Plot (PDP)","description":"Marginal effect of a feature on model prediction; global interpretability."},"advMl10":{"chapter":"Chapter 10","title":"XAI 2: LIME","description":"Local linear approximation to explain individual predictions."},"advMl11":{"chapter":"Chapter 11","title":"XAI 3: SHAP","description":"Shapley values for fair feature attribution to predictions."},"advMl12":{"chapter":"Chapter 12","title":"Time Series Preprocessing and Stationarity","description":"ADF test and differencing for stationarity."},"advMl13":{"chapter":"Chapter 13","title":"ARIMA and SARIMA","description":"Classical statistical forecasting with AR, MA, I, and seasonality."},"advMl14":{"chapter":"Chapter 14","title":"Prophet: Structural Time Series","description":"Trend, seasonality, and holiday effects for interpretable forecasting."},"advMl15":{"chapter":"Chapter 15","title":"Content-Based Filtering","description":"Recommendations from item attributes and similarity (e.g. 
cosine)."},"advMl16":{"chapter":"Chapter 16","title":"Matrix Factorization","description":"Latent factors for user-item rating prediction."},"advMl17":{"chapter":"Chapter 17","title":"Factorization Machines","description":"Efficient modeling of feature interactions in high-dimensional sparse data."},"advMl18":{"chapter":"Chapter 18","title":"Association Rules and Apriori","description":"Support, confidence, lift; traditional basket analysis."},"advMl19":{"chapter":"Chapter 19","title":"AutoML Basics: PyCaret and FLAML","description":"Automating preprocessing, model selection, and hyperparameter tuning."},"advMl20":{"chapter":"Chapter 20","title":"Advanced ML Summary: SOTA Pipeline and XAI","description":"From XGBoost/LightGBM pipelines to SHAP, time series, and recommender systems."}},"advDlChapters":{"advDl00":{"chapter":"Chapter 00","title":"Advanced DL: Large Models and Generative AI Paradigm"},"advDl01":{"chapter":"Chapter 01","title":"Transformer 1: Self-Attention and Parallelization"},"advDl02":{"chapter":"Chapter 02","title":"Transformer 2: Positional Encoding and Feed-Forward"},"advDl03":{"chapter":"Chapter 03","title":"Transformer Lineage: Encoder (BERT) vs Decoder (GPT)"},"advDl04":{"chapter":"Chapter 04","title":"Attention Optimization: FlashAttention and Sparse Attention"},"advDl05":{"chapter":"Chapter 05","title":"Vision Transformer (ViT) and Image Patches"},"advDl06":{"chapter":"Chapter 06","title":"Self-Supervised Learning"},"advDl07":{"chapter":"Chapter 07","title":"Prompt Engineering and In-Context Learning"},"advDl08":{"chapter":"Chapter 08","title":"PEFT 1: PEFT and LoRA"},"advDl09":{"chapter":"Chapter 09","title":"PEFT 2: QLoRA and Quantization Tuning"},"advDl10":{"chapter":"Chapter 10","title":"Alignment and RLHF"},"advDl11":{"chapter":"Chapter 11","title":"DPO: Alignment without Reinforcement Learning"},"advDl12":{"chapter":"Chapter 12","title":"RAG: Hallucination Control Architecture"},"advDl13":{"chapter":"Chapter 13","title":"LLM Agents and 
Tool Use"},"advDl14":{"chapter":"Chapter 14","title":"GNN and Message Passing"},"advDl15":{"chapter":"Chapter 15","title":"XAI in Deep Learning: Grad-CAM"},"advDl16":{"chapter":"Chapter 16","title":"Autoencoder and Unsupervised Dimensionality Reduction"},"advDl17":{"chapter":"Chapter 17","title":"VAE: Probability-Based Generative Space"},"advDl18":{"chapter":"Chapter 18","title":"GAN Basics"},"advDl19":{"chapter":"Chapter 19","title":"Conditional GAN (cGAN) and Applications"},"advDl20":{"chapter":"Chapter 20","title":"Diffusion Model 1: Forward and Reverse Process"},"advDl21":{"chapter":"Chapter 21","title":"Diffusion Model 2: Latent Diffusion"},"advDl22":{"chapter":"Chapter 22","title":"Vision-Language Model and CLIP"},"advDl23":{"chapter":"Chapter 23","title":"Speech-to-Text and Audio Processing"},"advDl24":{"chapter":"Chapter 24","title":"Model Compression and Knowledge Distillation"},"advDl25":{"chapter":"Chapter 25","title":"Inference Optimization and Deployment"},"advDl26":{"chapter":"Chapter 26","title":"Advanced DL Summary: AI Architecture and Future"}},"mlChapters":{"mlSectionLabels":{"whatIs":"What the concept is","whyImportant":"Why it matters","howUsed":"How it is used","problemSolving":"Summary"},"mlKnnProblemSolvingLabel":"Explanation for solving the problems","mlKnnVisualIntro":"Pick the K=3 nearest neighbors to the new point (?), then predict by majority vote of their labels.","mlKnnVisualCaption":"Dashed circles: distance order. K=3 neighbors (purple) labels: 1, 2, 2 → majority 2","mlKnnVisualStep0":"① Training data — points in feature space (labels 1 or 2)","mlKnnVisualStep1":"② New point (?) 
appears — we predict its label","mlKnnVisualStep2":"③ Find distance to the K=3 nearest (dashed circles)","mlKnnVisualStep3":"④ Connect to K=3 neighbors — in order of distance","mlKnnVisualStep4":"⑤ Majority vote: labels 1, 2, 2 → predict 2","mlLinearRegressionVisualIntro":"Find the line $\\hat{y} = w x + b$ that best fits the data points.","mlLinearRegressionVisualStep0":"① Training data — (x, y) scatter plot","mlLinearRegressionVisualStep1":"② Wrong initial line — before gradient descent","mlLinearRegressionVisualStep2":"③ Line learns and moves to optimal position","mlLinearRegressionVisualStep3":"④ Learning complete — predict $\\hat{y}$ from new $x$","mlLinearRegressionVisualCaption":"$$y \\approx 0.7x + 1.1$ — $w$, $b$ learned by gradient descent","mlLinearRegressionVisualLearningBadge":"Learning...","mlLinearRegressionVisualPlay":"Watch line learning process","mlLinearRegressionVisualReplay":"Replay","mlLinearRegressionProblemSolvingLabel":"Explanation for solving the problems","mlMseVisualIntro":"MSE is the average of squared errors between prediction $\\hat{y}$ and actual $y$.","mlMseVisualStep0":"① Data points and prediction line $\\hat{y} = w x + b$","mlMseVisualStep1":"② Error (residual) bars from each point to the line","mlMseVisualStep2":"③ Squared errors $(y_i - \\hat{y}_i)^2$ visualized","mlMseVisualStep3":"④ MSE $= \\frac{1}{n}\\sum_i (y_i - \\hat{y}_i)^2$","mlMseVisualCaption":"MSE $= \\frac{1}{n}\\sum_i (y_i - \\hat{y}_i)^2$ — the smaller the loss, the better the line fits the data.","mlMseVisualSquaresLabel":"Squared error = area (side length = |residual|)","mlMseProblemSolvingLabel":"Explanation for solving the problems","mlLogisticProblemSolvingLabel":"Explanation for solving the problems","mlLogisticVisualIntro":"The larger the linear score $z$, the closer $\\sigma(z)$ is to 1, so we classify as class 1. $z=0$ is the decision boundary.","mlLogisticVisualCaption":"Sigmoid: $\\sigma(z) = \\frac{1}{1+e^{-z}}$. 
When $z>0$, $\\hat{y}=1$; when $z \\le 0$, $\\hat{y}=0$.","mlLogisticVisualFormulaExplain":"**How to read the formula** — When $z$ is large and negative, $e^{-z}$ is large so $\\sigma(z) \\approx 0$. When $z=0$, $\\sigma(0)=0.5$. When $z$ is large and positive, $e^{-z} \\approx 0$ so $\\sigma(z) \\approx 1$. So the formula squeezes any $z$ into a probability between 0 and 1.","mlLogisticVisualXAxisLabel":"z (linear score)","mlLinearRegressionProblemSolvingTable":"$1e","mlKnnProblemSolvingTable":"| Step | Description |\n| :--- | :--- |\n| **Input** | New feature vector $\\mathbf{x}$ |\n| **Stored** | Labeled examples $(\\mathbf{x}_i, y_i)$ |\n| **1** | Compute distance $d(\\mathbf{x}, \\mathbf{x}_i)$ to each $\\mathbf{x}_i$ |\n| **2** | Select K smallest distances |\n| **3 (classification)** | Predict $\\hat{y}$ by **majority vote** of the K labels |\n| **3 (regression)** | Predict $\\hat{y}$ as **average** of the K $y_i$ values |","mlDataFeature":{"chapter":"Chapter 00","title":"Data and Features: The Start of Machine Learning","description":"Machine learning starts with data. We turn images, text, and numbers into **features**—numeric representations that let the model learn patterns. The world of numbers and functions from Basic Math Ch00 becomes reality here.","sectionTitle":"What are Data and Features?","whatIs":{"0":"**Data is the raw material of machine learning** — As we learned in Basic Math Ch00, deep learning and machine learning turn images, text, and sound into **numbers**. These **numeric inputs** paired with **labels** (correct answers) form **data**. For example, 'cat image + cat' is one data point, and thousands of such pairs become the material for the model to learn from.","1":"**Features are the numeric essence of data** — A photo we see is just a pile of tens of thousands of pixel numbers to a computer. **Features** are the useful information—like ear shape, eye size, fur color—extracted and expressed as numbers. 
Mathematically they are **vectors**, extracted from raw data through **functions**. The 'functions that define input-output rules' from Ch00 handle this transformation.","2":"**In short** — Data is a collection of (input, label) pairs; features are the result of turning that input into **numeric vectors** the model can understand. Good features lead to better learning; bad features hurt performance even with lots of data. The start of machine learning is deciding what data to use and what features to extract."},"whyImportant":{"0":"**Without data, learning is impossible** — Every decision a model makes is the result of **numbers and functions**. As in Ch00, to follow the AI computation we need data expressed as **numbers**. If data is scarce or labels are wrong, the model learns the wrong patterns.","1":"**Feature design sets the model's limits** — Deciding which information to turn into numbers is called **feature engineering**. Using only 'yesterday's closing price' vs. adding 'moving average, volume, volatility' for stock prediction leads to very different results. **Vectors and matrices** bundle many features for batch computation—a core part of the Ch00 roadmap—and the quality of features drives model performance.","2":"**Bridge to the next chapters** — Ch02 KNN, Ch03 Linear Regression, Ch05 Logistic Regression, and all ML algorithms take **feature vectors** as input. Understanding data and features is needed to interpret why a model made a given prediction, and the later chapters on **differentiation** and **probability** build on this foundation."},"howUsed":{"0":"**Input → feature extraction → model → prediction** — The ML pipeline matches the **input → numeric conversion → repeated functions → output** structure from Ch00. Feature extraction is the 'numeric conversion' step; models (linear regression, KNN, etc.) are sets of **functions**. 
**Differentiation** is used to reduce error during training; **probability** expresses uncertainty in predictions like '90% chance this image is a cat'."},"problemSolving":{"0":"This chapter summarized the role of **data** and **features** in machine learning and how they are used in practice. Data is a collection of (input, label) pairs; features are the result of turning that input into **numeric vectors** the model can use. **Feature engineering**—choosing and designing good features—strongly affects performance, so it helps to solidify these ideas before moving on to the next chapters (KNN, linear regression, etc.).","1":"| Concept | Role in data/features | Basic math link |\n| --- | --- | --- |\n| **Data** | Collection of (input, label) pairs, expressed as numbers | Domain and codomain of functions (Ch01) |\n| **Features** | Input converted to vectors; model input | Vectors, matrices (Ch00 roadmap) |\n| **Training** | Adjusting model parameters from data | Differentiation, gradient (Ch06–08) |\n| **Prediction** | Feature vector → model → prediction or probability | Probability, distributions (Ch10–12) |"}},"mlSupervisedUnsupervisedSelf":{"chapter":"Chapter 01","title":"Supervised, Unsupervised, and Self-Supervised Learning","description":"Machine learning is often divided into **supervised**, **unsupervised**, and **self-supervised** learning depending on how data is used. **Supervised learning** is like studying with an answer key; **unsupervised learning** is like finding patterns and grouping similar items without labels; **self-supervised learning** is like masking part of the data and learning by predicting the missing part. 
This chapter summarizes the core ideas, math, and real-world use of these three paradigms so you can build a solid base for the algorithms covered later.","sectionTitle":"Three Ways of Learning: Supervised, Unsupervised, Self-Supervised","whatIs":{"0":"**Supervised Learning: Learning from input–label pairs**\nThe model is given **input $\\mathbf{x}$** and the corresponding **label (target) $y$** as pairs. The goal is to approximate a function $y = f(\\mathbf{x})$. Formally we have a training set $\\mathcal{D} = \\{(\\mathbf{x}_1, y_1), (\\mathbf{x}_2, y_2), \\ldots\\}$ and find $f$ by **minimizing a loss** (e.g. MSE, cross-entropy). Ch02 KNN, Ch03 Linear Regression, Ch04 Logistic Regression are all supervised.\n* **Example 1 (classification)**: Spam filter—email content ($\\mathbf{x}$) → spam or not ($y$).\n* **Example 2 (regression)**: House price—area, location ($\\mathbf{x}$) → price ($y$).\n* **Example 3 (medical)**: Patient test values ($\\mathbf{x}$) and diagnosis ($y$) for decision support.","1":"**Unsupervised Learning: Discovering hidden structure**\nOnly **input $\\mathbf{x}$** is given; there is **no label $y$**. Think of it as \"only questions, no answer key.\" The goal is to find **structure, patterns, or clusters** using **distance and similarity** between $\\mathbf{x}$s: group similar points (clustering), compress to fewer dimensions (dimensionality reduction), or flag **anomalies** that fall outside the normal pattern.\n* **Example 1 (clustering)**: Customer age and purchase history ($\\mathbf{x}$) → segment similar customers.\n* **Example 2 (anomaly detection)**: Learn normal payment patterns ($\\mathbf{x}$), then flag unusual transactions.\n* **Example 3 (dimension reduction)**: Reduce many features to 2–3 numbers for visualization or denoising. (You’ll learn concrete methods later.)","2":"**Self-Supervised Learning: Creating targets from data**\nInstead of human labels, the model creates **pseudo-labels** from the data. 
Typical flow: (1) **Mask** part of the input (e.g. a word, an image patch). (2) **Predict** the masked part from the rest. (3) **Use** the learned representation for downstream tasks with a small amount of supervised data. This is how BERT, GPT, and many vision models are pre-trained on large unlabeled corpora.\n* **Example 1 (language)**: \"I ate [ MASK ]\" → predict the masked word from context (LLMs).\n* **Example 2 (vision)**: Mask a region of an image and reconstruct it from the rest.\n* **Example 3 (contrastive)**: Treat two augmented views of the same image as \"same\" and different images as \"different\" to learn representations."},"whyImportant":{"0":"**Data nature and cost** — Building labels for all data is expensive. When labels are sufficient, **supervised** is effective; when they are scarce, **unsupervised** or **self-supervised** use unlabeled data, then a small supervised fine-tuning step. **Interpretability** also differs: supervised allows some explanation via loss and decision path; unsupervised/self-supervised require separate interpretation (e.g. cluster names, visualization).","1":"**Pre-training and fine-tuning** — Modern pipelines often use **self-supervised** pre-training on large unlabeled data, then **supervised** fine-tuning on a small labeled set. **Unsupervised** is common in preprocessing and exploration—e.g. cluster customers with K-Means, assign human meanings to clusters (e.g. \"loyal\", \"churn risk\"), then build a supervised churn model. Choosing the right paradigm makes the pipeline clear and realistic given data size and label cost."},"howUsed":{"0":"**Supervised** — Ch02 KNN, Ch03 Linear Regression, Ch04 Logistic Regression learn from (input, label) pairs. **Classification**: spam filter, disease prediction, image classification. 
**Regression**: house price, sales, temperature—Ch03/Ch04 cover the math and optimization.","1":"**Unsupervised** — Ch08 K-Means clusters data without labels; **dimension reduction** (reducing many features to 2–3 numbers) is another key tool. **Clustering**: customer segmentation, topic grouping. **Anomaly detection**: learn a \"normal\" region, flag points outside it.","2":"**Self-supervised** — BERT (masked word prediction), GPT (next-token prediction), and **contrastive learning** in vision are widely used. After pre-training, a small amount of labeled data is used for QA, summarization, or classification."},"problemSolving":{"0":"**Summary** — (1) **Supervised**: learn $y=f(\\mathbf{x})$ from $(\\mathbf{x},y)$ pairs. (2) **Unsupervised**: find structure/clusters from $\\mathbf{x}$ only. (3) **Self-supervised**: learn from pseudo-labels (e.g. masked tokens), then use small supervised data for downstream tasks.","1":"| | Supervised | Unsupervised | Self-Supervised |\n| :--- | :--- | :--- | :--- |\n| **Label** | Yes ($y$) | No | Self-created target |\n| **Goal** | Predict $y$ (classification/regression) | Structure, clusters, dimensionality reduction | Representation learning |\n| **Examples** | KNN, linear/logistic regression | K-Means, dimension reduction | BERT, contrastive learning |","2":"**By problem type** — **Definition**: supervised = (x,y) pairs; unsupervised = no label; self-supervised = self-created target. **Task**: Human-provided labels? → Supervised. No labels, only grouping/reduction? → Unsupervised. Labels derived from data (e.g. masked word)? → Self-supervised. 
**Scenarios**: spam classification (supervised), customer clustering (unsupervised), predict masked word (self-supervised).","3":"**One-line comparison** — Supervised: \"Learn from (question, answer) pairs.\" Unsupervised: \"No answers—only group or reduce the data.\" Self-supervised: \"Mask part of the data and predict the gap to learn representations.\" In problems, check whether labels exist and whether they are human-provided or data-derived to choose the type."},"mlSupervisedUnsupervisedSelfVisualIntro":"Three learning paradigms: supervised (input–label pairs), unsupervised (no label), self-supervised (self-created target).","mlSupervisedUnsupervisedSelfVisualStep0":"Supervised: learn a prediction function from (input, label) pairs","mlSupervisedUnsupervisedSelfVisualStep1":"Unsupervised: discover structure and clusters without labels","mlSupervisedUnsupervisedSelfVisualStep2":"Self-supervised: learn representations from self-created targets","mlSupervisedUnsupervisedSelfProblemSolvingLabel":"Problem-solving guide","mlSupervisedUnsupervisedSelfVisualPhase0Title":"Supervised: input x and label y come in pairs","mlSupervisedUnsupervisedSelfVisualPhase0Caption":"When (x, y) pairs are given in order, the model learns the rule","mlSupervisedUnsupervisedSelfVisualPhase1Title":"Unsupervised: only input x (no label y)","mlSupervisedUnsupervisedSelfVisualPhase1Caption":"There is no y (label), only x. 
Some x blink on and off → the model still finds structure and clusters","mlSupervisedUnsupervisedSelfVisualPhase1NoLabelBadge":"No label","mlSupervisedUnsupervisedSelfVisualPhase2Title":"Self-supervised: mask part of the data and predict the gap","mlSupervisedUnsupervisedSelfVisualPhase2Caption1":"Mask part of the input","mlSupervisedUnsupervisedSelfVisualPhase2Caption2":"Model predicts the masked part","mlSupervisedUnsupervisedSelfVisualPhase2Caption3":"The gap is filled with the predicted word","mlSupervisedUnsupervisedSelfVisualPhase2Prefix":"I ate ","mlSupervisedUnsupervisedSelfVisualPhase2Suffix":".","mlSupervisedUnsupervisedSelfVisualPhase2Filled":"rice","mlSupervisedUnsupervisedSelfVisualPhase2Example":"e.g. fill in the blank → representation learning (BERT, etc.)","mlSupervisedUnsupervisedSelfVisualPhase2Step1":"Mask","mlSupervisedUnsupervisedSelfVisualPhase2Step2":"Predict","mlSupervisedUnsupervisedSelfVisualPhase2Step3":"Fill","mlSupervisedUnsupervisedSelfVisualAutoCycle":"All three types animate at the same time","problemAnswerHint":"Enter 1 for supervised, 2 for unsupervised, 3 for self-supervised.","problems":{"definition_1_0":"Learning from data where input and label (answer) are paired is which type? ①Supervised ②Unsupervised ③Self-supervised","definition_1_1":"Learning $y=f(\\mathbf{x})$ from (input $\\mathbf{x}$, label $y$) pairs is which type? ①Supervised ②Unsupervised ③Self-supervised","definition_1_2":"The learning type likened to a teacher marking answers with a red pen is? ①Supervised ②Unsupervised ③Self-supervised","definition_1_3":"Learning that uses human-provided labels for classification or regression is? ①Supervised ②Unsupervised ③Self-supervised","definition_1_4":"The main learning type that learns classification or regression from (input, label) pairs is? ①Supervised ②Unsupervised ③Self-supervised","definition_1_5":"Learning where the data comes with a target and the model is trained to match it is? 
①Supervised ②Unsupervised ③Self-supervised","definition_2_0":"Learning that finds structure, patterns, or clusters from input only, without labels, is? ①Supervised ②Unsupervised ③Self-supervised","definition_2_1":"When there is no label $y$, only $\\mathbf{x}$, finding groups in the data is which type? ①Supervised ②Unsupervised ③Self-supervised","definition_2_2":"Clustering similar data without labels corresponds to which learning type? ①Supervised ②Unsupervised ③Self-supervised","definition_2_3":"The learning type likened to finding and grouping types by yourself is? ①Supervised ②Unsupervised ③Self-supervised","definition_2_4":"Label-free learning often used for dimensionality reduction or anomaly detection is? ①Supervised ②Unsupervised ③Self-supervised","definition_2_5":"Discovering structure in data without human-provided answers is which type? ①Supervised ②Unsupervised ③Self-supervised","definition_3_0":"Learning from a 'pseudo-label' created from the data itself is? ①Supervised ②Unsupervised ③Self-supervised","definition_3_1":"Learning that creates its own target (e.g. masked word, next sentence) is? ①Supervised ②Unsupervised ③Self-supervised","definition_3_2":"Learning by masking part of a sentence and predicting that part is? ①Supervised ②Unsupervised ③Self-supervised","definition_3_3":"The paradigm used to learn representations from large unlabeled data is? ①Supervised ②Unsupervised ③Self-supervised","definition_3_4":"The learning type likened to making your own practice test and solving it is? ①Supervised ②Unsupervised ③Self-supervised","definition_3_5":"Learning that creates 'same vs. different' pairs by itself to learn representations is? ①Supervised ②Unsupervised ③Self-supervised","taskClassify_0":"Spam vs. non-spam classification (with labels) is which learning type? ①Supervised ②Unsupervised ③Self-supervised","taskClassify_1":"Grouping similar customers from purchase data only, with no labels, is? 
①Supervised ②Unsupervised ③Self-supervised","taskClassify_2":"Predicting masked words in sentences to learn word representations is? ①Supervised ②Unsupervised ③Self-supervised","taskClassify_3":"Predicting apartment price from size and location is? ①Supervised ②Unsupervised ③Self-supervised","taskClassify_4":"Grouping similar images with no labels (clustering) is? ①Supervised ②Unsupervised ③Self-supervised","taskClassify_5":"Pre-training on large text then fine-tuning with few labels—the pre-training stage is? ①Supervised ②Unsupervised ③Self-supervised","taskClassify_6":"Building a disease-prediction model from medical images and disease labels is? ①Supervised ②Unsupervised ③Self-supervised","taskClassify_7":"Customer segmentation by grouping similar customers only, with no labels, is? ①Supervised ②Unsupervised ③Self-supervised","taskClassify_8":"Learning context representations by predicting the next sentence is? ①Supervised ②Unsupervised ③Self-supervised","taskClassify_9":"Predicting exam score from study time is? ①Supervised ②Unsupervised ③Self-supervised","taskClassify_10":"Anomaly detection when only normal data exists and almost no anomaly labels is closest to? ①Supervised ②Unsupervised ③Self-supervised","taskClassify_11":"Learning representations by predicting a masked part of an image from the rest is? ①Supervised ②Unsupervised ③Self-supervised","scenario_0":"A hospital trains a model on past patient data (symptoms, tests) and diagnosis (label) to predict 'Does this patient have disease A?' This is? ①Supervised ②Unsupervised ③Self-supervised","scenario_1":"A store splits customers into groups using only purchase history, with no extra labels. This is? ①Supervised ②Unsupervised ③Self-supervised","scenario_2":"A model is trained by masking 15% of words in Wikipedia and predicting them. This is? ①Supervised ②Unsupervised ③Self-supervised","scenario_3":"A model predicts tomorrow's sales from weather, date, and past sales (label). This is? 
①Supervised ②Unsupervised ③Self-supervised","scenario_4":"Video data is indexed by grouping similar scenes with no labels. This is? ①Supervised ②Unsupervised ③Self-supervised","scenario_5":"Context is learned by predicting 'next sentence' on large documents, then fine-tuned with few QA labels. The first stage is? ①Supervised ②Unsupervised ③Self-supervised","scenario_6":"A classifier is trained on dog/cat images with species labels. This is? ①Supervised ②Unsupervised ③Self-supervised","scenario_7":"Stock price series only, no labels; patterns are split into segments. This is? ①Supervised ②Unsupervised ③Self-supervised","scenario_8":"Same sentence in different wording; 'same meaning' is used as target to learn representations. This is? ①Supervised ②Unsupervised ③Self-supervised","scenario_9":"An application (experience, education) and pass/fail (label) are used to build a pass-prediction model. This is? ①Supervised ②Unsupervised ③Self-supervised","scenario_10":"News articles only, no topic labels; articles are grouped by topic. This is? ①Supervised ②Unsupervised ③Self-supervised","scenario_11":"Speech representations are learned by masking and reconstructing parts of audio. This is? ①Supervised ②Unsupervised ③Self-supervised","trueFalse_0":"\"Learning from data where input and label are paired\" describes supervised learning. Which type is this? ①Supervised ②Unsupervised ③Self-supervised","trueFalse_1":"\"Finding only structure in data without labels\" describes unsupervised learning. Which type is this? ①Supervised ②Unsupervised ③Self-supervised","trueFalse_2":"\"Learning from a target created from the data (e.g. masked word)\" describes self-supervised learning. Which type is this? ①Supervised ②Unsupervised ③Self-supervised","trueFalse_3":"Fitting a function to predict a value from (input, label) pairs. Which learning type? ①Supervised ②Unsupervised ③Self-supervised","trueFalse_4":"Splitting data into K groups using only the data, no labels. Which learning type? 
①Supervised ②Unsupervised ③Self-supervised","trueFalse_5":"Learning by predicting masked words in a sentence. Which learning type? ①Supervised ②Unsupervised ③Self-supervised","trueFalse_6":"Learning from human-provided pass/fail labels. Which learning type? ①Supervised ②Unsupervised ③Self-supervised","trueFalse_7":"\"Grouping similar items from data only, with no answers\" describes unsupervised learning. Which type is this? ①Supervised ②Unsupervised ③Self-supervised","trueFalse_8":"Learning representations from self-created 'same/different' pairs. Which learning type? ①Supervised ②Unsupervised ③Self-supervised","trueFalse_9":"Using (input, label) pairs at training time and predicting the label for new input. Which learning type? ①Supervised ②Unsupervised ③Self-supervised","trueFalse_10":"In anomaly detection, learning a 'normal region' from normal data only is closest to unsupervised. Which type is this? ①Supervised ②Unsupervised ③Self-supervised","trueFalse_11":"\"Learning context by predicting the next sentence\" is self-supervised. Which type is this? ①Supervised ②Unsupervised ③Self-supervised"}},"mlKnn":{"chapter":"Chapter 02","title":"K-Nearest Neighbors (KNN): Birds of a Feather","description":"**Birds of a feather flock together** — KNN finds the **K nearest** stored examples and uses their labels (majority vote) to predict the new one. No fancy training; just **distance** and neighbors.","sectionTitle":"K-Nearest Neighbors (KNN): Birds of a Feather","whatIs":{"0":"**What is KNN?** — For a new data point, we pick the **K closest** points among labeled data and assign the **majority label**. Example: if 4 of the 5 nearest emails are spam, the new email is classified as spam.","1":"**'Closest' means distance in feature space** — Usually **Euclidean distance**: $d(\\mathbf{x}, \\mathbf{y}) = \\sqrt{\\sum_{i}(x_i - y_i)^2}$. 
With two features, this is the straight-line distance on the plane.","2":"**K is a hyperparameter** — K=1 uses only the single nearest neighbor; larger K smooths the decision but can blur boundaries. Odd K is often used to avoid ties."},"whyImportant":{"0":"**No explicit training (lazy learning)** — KNN does not learn a compact model; at prediction time it computes distances to all stored points. Training cost is low; prediction cost can be high.","1":"**Interpretable** — We can explain a prediction by showing the K neighbors (e.g. \"spam because 4 of 5 similar emails were spam\"), which supports explainable AI.","2":"**Useful as a baseline** — Before trying complex models, KNN gives a quick sense of how well the data can be classified."},"howUsed":{"0":"**Classification** — Majority vote among the K neighbors' labels. Used in image classification, spam detection, risk bands, etc.","1":"**Regression** — Predict the **average** of the K neighbors' target values (e.g. house price from nearby sales).","2":"**Distance and scale** — If features have different scales, distance is dominated by one feature. **Normalization** or **standardization** is recommended before computing distances."},"problemSolving":{"0":"KNN works by selecting the **K closest** stored examples to a new input, then using **majority vote** of their labels for classification or the **average** of their values for regression. 
There is no separate training step—only distance computation—so it is intuitive, but **normalization (scaling)** is important so that no single feature dominates the distance.","1":"| Step | Description |\n| :--- | :--- |\n| **Input** | New feature vector $\\mathbf{x}$ |\n| **Stored** | Labeled examples $(\\mathbf{x}_i, y_i)$ |\n| **1** | Compute distance $d(\\mathbf{x}, \\mathbf{x}_i)$ to each $\\mathbf{x}_i$ |\n| **2** | Select K smallest distances |\n| **3 (classification)** | Predict $\\hat{y}$ by **majority vote** of the K labels |\n| **3 (regression)** | Predict $\\hat{y}$ as **average** of the K $y_i$ values |"},"problemSolvingTable":"| Step | Description |\n| :--- | :--- |\n| **Input** | New feature vector $\\mathbf{x}$ |\n| **Stored** | Labeled examples $(\\mathbf{x}_i, y_i)$ |\n| **1** | Compute distance $d(\\mathbf{x}, \\mathbf{x}_i)$ to each $\\mathbf{x}_i$ |\n| **2** | Select K smallest distances |\n| **3 (classification)** | Predict $\\hat{y}$ by **majority vote** of the K labels |\n| **3 (regression)** | Predict $\\hat{y}$ as **average** of the K $y_i$ values |"},"mlLinearRegression":{"chapter":"Chapter 03","title":"Linear Regression: A Line Through the Data","description":"When data points are scattered, **linear regression** finds the **line that best fits** their trend and predicts values for new inputs. It is the first regression model where you can see how **functions**, **derivatives**, and **partial derivatives** from Basic Math lead directly to machine learning 'training'.","sectionTitle":"Linear Regression: A Line Through the Data","whatIs":{"0":"**What is linear regression?** — We assume a **linear relationship** $y = w x + b$ (or $y = \\mathbf{w}^\\top \\mathbf{x} + b$ for multiple variables) between input $x$ and output $y$, and find the **weights $w$ and intercept $b$** that best fit the data. 
The **function** $y = f(x)$ from Basic Math Ch01 is here a concrete **linear function**.","1":"**What does 'best fit' mean?** — We minimize the **error** between predictions $\\hat{y}_i = w x_i + b$ and actual values $y_i$. The function that measures this error is the **loss function**; **MSE (Mean Squared Error)**, covered in Ch04, is the most common.","2":"**Difference from KNN** — KNN predicted by the 'average of neighbors'; linear regression learns and stores **one formula (a line)**. At prediction time, we only compute $\\hat{y} = w x + b$ without searching for neighbors."},"whyImportant":{"0":"**First application of differentiation and optimization** — To minimize error, we use **differentiation** (Basic Math Ch06). Following the **gradient** of the loss with respect to $w$ and $b$ leads to the minimum. This is **gradient descent**, the same principle behind deep learning training.","1":"**Interpretability** — The learned $w$ tells us 'how much $y$ changes when $x$ increases by 1'. For example, with house area ($x$) and price ($y$), $w > 0$ means 'larger area, higher price'—matching intuition. This **interpretability** matters when trusting and improving models in practice.","2":"**Foundation for other models** — Logistic regression (Ch05), a single neuron in a neural network—all use 'linear transformation + nonlinear function'. Understanding linear regression clarifies how their **linear part** works."},"howUsed":{"0":"**Regression** — Used to predict **continuous numbers**: house prices, sales, temperature, scores. With multiple features, $y = w_1 x_1 + w_2 x_2 + \\cdots + w_n x_n + b$ becomes **multiple linear regression**.","1":"**Feature importance** — Features with larger $|w_i|$ have more influence on predictions. When doing feature engineering (Ch01), we use these values to decide which features to keep or drop.","2":"**Normal equation vs gradient descent** — With few features, the **normal equation** gives the optimal solution in one step. 
With many features or large data, **gradient descent** updates $w$ iteratively. **Partial derivatives and gradients** from Basic Math Ch08 are the key tools here."},"visual":"","problemSolving":{"0":"**Summary: A process of trial and error that reduces error** — Linear regression is like a detective finding **one single line** ($y=wx+b$) that best passes through scattered data points. **Model (assumption)**: We start by drawing a random line. Of course it doesn't fit the data well, so the **error** is large. **Learning**: We use gradient descent to reduce this error—like walking down a mountain with eyes closed, step by step, toward the lowest valley (the point of minimum error). **Prediction**: Once we reach the valley floor, we've found the optimal slope ($w$) and position ($b$). Now when a new question ($x$) arrives, we simply plug it into this finished formula to predict the answer ($\\hat{y}$) instantly.","1":"**Three steps: extracting a rule from data** — Linear regression finds a **simple rule** ($y=wx+b$) within complex data.\n\n**① Model** — We assume \"input ($x$) and target ($y$) have a linear relationship\" and set up the model.\n\n**② Optimization (training)** — We compute the **loss** (the difference between prediction $\\hat{y}$ and actual $y$), then use gradient descent to update $w$ (slope) and $b$ (intercept) little by little to minimize it. This is exactly the same principle as deep learning.\n\n**③ Inference (prediction)** — The learned line compresses the data's pattern. When new data arrives, we substitute it into the line formula and predict the result instantly."}},"mlMse":{"chapter":"Chapter 04","title":"Loss Function (MSE): Measuring Prediction Error","sectionTitle":"Loss Function (MSE): Measuring Prediction Error","description":"When finding the **'best-fitting line'** in linear regression, we need a single number that says how far predictions are from the truth. 
The **Sum of Squared Errors (SSE)** is the sum of $(y - \\hat{y})^2$ over all points. Dividing SSE by the number of data points gives the **Mean Squared Error (MSE)**. The closer MSE is to zero, the better the model fits the data—and gradient descent minimizes this MSE.","whatIs":{"0":"**The ruler for error** — We need a **loss function** that summarizes how wrong the model is. At each point, the difference between actual $y$ and prediction $\\hat{y}$ is the **residual** (or error). Squaring each residual and adding them up gives the **Sum of Squared Errors (SSE)**. Dividing SSE by the number of points $n$ gives the **Mean Squared Error (MSE)**: $\\text{MSE} = \\frac{1}{n}\\sum_i (y_i - \\hat{y}_i)^2 = \\text{SSE}/n$. The smaller this value, the better the model fits.","1":"**Why square?** — A residual of $+2$ or $-2$ both mean 'off by 2'. If we summed raw residuals, $+2 + (-2) = 0$ and they would cancel. **Squaring** keeps everything positive and penalizes large errors more.","2":"**Link to linear regression** — The line $\\hat{y} = w x + b$ from Ch03 'fits the data best' when **MSE** (or equivalently **SSE**) is minimized. Gradient descent updates the slope $w$ and intercept $b$ in the direction that reduces MSE."},"whyImportant":{"0":"**It defines the learning goal** — Machine learning is often summarized as 'minimize the loss'. For regression, when that loss is MSE, the model moves only in directions that lower MSE, so the **objective is clear**.","1":"**Differentiation is easy** — The square function has a simple derivative, so gradient descent with MSE is tractable. Deep learning also uses squared-error-style losses widely.","2":"**RMSE: back to the original units** — Because MSE averages **squared** errors, its unit is '$y$ squared' (e.g. dollars² for price prediction). 
In practice we often want to say “on average we’re off by so many dollars or degrees.” Taking the square root of MSE gives **RMSE (Root Mean Squared Error)**: $\\sqrt{\\text{MSE}}$, which has the same units as $y$. Once you understand MSE, RMSE follows naturally."},"howUsed":{"0":"**Training regression models** — Linear regression, neural network regression, etc. compute MSE on the training data and update parameters to reduce it.","1":"**Comparing models** — To compare which line (or model) fits the data better, compute MSE for each; the **smaller** value wins.","2":"**Validation and test** — After training, computing MSE on unseen data (validation/test set) gives an **objective measure of generalization**."},"visual":"...","problemSolving":{"0":"$1f"}},"mlLogistic":{"chapter":"Chapter 05","title":"Logistic Regression: Pass or Fail?","description":"Where linear regression predicts a 'score', **logistic regression** is the specialist for **yes/no** classification—e.g. \"Will this score mean **pass (1)** or **fail (0)**?\" It uses the **sigmoid function** to turn a score into a probability between 0 and 1.","sectionTitle":"Logistic Regression: Pass or Fail?","whatIs":{"0":"**The S-curve: sigmoid** — The score $z$ from a linear model can be large or negative. Probabilities must lie between 0 and 1. The **sigmoid** $\\sigma(z) = \\frac{1}{1+e^{-z}}$ maps any real $z$ into (0, 1).","1":"**Decision boundary** — When the sigmoid outputs e.g. \"probability of pass = 0.7\", we need a rule. Usually we use **0.5**: if probability ≥ 0.5 we predict **1 (yes)**, otherwise **0 (no)**.","2":"**Same core as linear regression** — Logistic regression still computes a score $z = wx + b$ first; the only difference is passing that score through the **sigmoid** to get a probability.","3":"**How to read $\\sigma(z) = \\frac{1}{1+e^{-z}}$** — When $z$ is large and negative, $e^{-z}$ is large so $\\sigma(z) \\approx 0$. When $z=0$, $\\sigma(0)=0.5$. 
When $z$ is large and positive, $e^{-z} \\approx 0$ so $\\sigma(z) \\approx 1$. So any $z$ is squeezed into a probability in (0, 1)."},"whyImportant":{"0":"**Many real problems are yes/no** — Spam or not? Disease or not? Will the user buy? **Binary classification** is everywhere; logistic regression is the standard baseline.","1":"**Confidence as a number** — Saying \"pass with 98% probability\" is more useful than just \"pass\". Logistic regression gives a **probability**, which supports better decisions.","2":"**Bridge to deep learning** — A single neuron in a neural network behaves much like logistic regression. Mastering this makes deep learning easier later."},"howUsed":{"0":"**Spam filter** — Compute \"probability this email is spam\" from features; if above a threshold, send to spam.","1":"**Medical AI** — From X-rays or lab values, predict \"probability of disease\" to support diagnosis.","2":"**Marketing and recommendations** — Predict \"will this user churn?\" or \"will they click?\" for targeting and ads."},"visual":"","problemSolving":{"0":"**Logistic regression summary** — It is for **binary classification** (yes/no, pass/fail). We compute a linear score $z = w_1 x_1 + w_2 x_2 + \\cdots + b$, then apply the **sigmoid** $\\sigma(z) = \\frac{1}{1+e^{-z}}$ to get a probability. We predict $\\hat{y}=1$ if probability ≥ 0.5, else $\\hat{y}=0$ ($z=0$ is the decision boundary). It is important because many real tasks are binary; it also gives **confidence** (probability) and is the basis for understanding neurons in deep learning. Used in spam filters, medical decision support, and marketing (churn, click prediction). **Solution flow**: compute $z$ → $\\sigma(z)$ → if $z \\ge 0$ then $\\hat{y}=1$, else $\\hat{y}=0$. 
See the **Explanation for problem solving** block below for examples."}},"mlDecisionTree":{"chapter":"Chapter 06","title":"Decision Tree: Twenty Questions to the Answer","description":"A decision tree works like the game of **Twenty Questions**: ask yes/no questions, follow branches, and reach a prediction at a leaf. It is easy to interpret (you can see exactly why it made each decision) and is the building block for random forests and other ensemble methods.","sectionTitle":"Decision Tree: Twenty Questions to the Answer","whatIs":{"0":"**Basic structure** — Picture an upside-down tree. At the top is the **root node** (first question). From there you ask a condition (e.g. “Is feature $x_1 \\le 3$?”); **yes** and **no** lead to **internal nodes**. When you can’t split further, you reach a **leaf node** and output the **prediction** (class or value).","1":"**Same as Twenty Questions** — Just like guessing an animal by asking “Does it have four legs?” → “Is it a herbivore?” → “Tiger!”, the tree narrows down the answer step by step. Each question splits the data into two groups.","2":"**Good questions: reducing impurity** — **Impurity** measures how mixed the classes are at a node. We want splits that make nodes purer. Two common formulas: **Gini** $G = 1 - \\sum p_i^2$ and **Entropy** $H = -\\sum p_i \\log_2 p_i$. When one class has 100% ($p=1$), both are 0 (pure). When classes are half-and-half, impurity is high.","3":"**Information gain** — **Information gain** = impurity before the split minus (weighted) impurity after. It measures how much a question “cleans up” the data. The tree chooses the question with the highest information gain at each step.","4":"**Prediction at the leaf** — At a **leaf**, we output: for **classification**, the **majority class** of the samples there; for **regression**, the **average** of their target values. 
For new data, we just follow the path and read off the leaf’s prediction.","5":"**Pruning** — A tree that is too deep **overfits** (memorizes the training set). **Pruning** cuts branches to limit depth and improve generalization. Decision trees are also the base models used in **random forest** and other ensembles, which control overfitting mainly by combining many trees."},"whyImportant":{"0":"**Explainable AI** — Unlike many black-box models, a decision tree shows the exact path of questions that led to each prediction (e.g. “age < 30 and income ≥ 30M → approve loan”). This is valued in finance and healthcare.","1":"**Nonlinear boundaries** — Linear models cut the space with a single line; a tree can approximate **step-like** boundaries by repeated splits, capturing more complex patterns.","2":"**Foundation for ensembles** — A single tree can be unstable, but hundreds of trees (e.g. **random forest**) form a strong, robust model. Ch06 is the basis for Ch07 Ensemble."},"howUsed":{"0":"**Credit and loans** — Questions like “Income ≥ 50M?” and “Any default in the last year?” form a path to approve or deny.","1":"**Medical decision support** — Patient data (blood pressure, cholesterol, etc.) is used in a sequence of questions to predict disease risk and support diagnosis.","2":"**Marketing (churn, purchase)** — “Registered > 6 months?”, “Logins in the last month ≤ 3?” help find at-risk customers for targeted campaigns."},"problemSolving":{"0":"**Decision tree — solving guide** — (1) **Follow path**: Start at root; 0 = no/left, 1 = yes/right; the leaf’s prediction is the answer. (2) **Gini**: Get $p_i$ from class counts, compute $G = 1 - \\sum_i p_i^2$, then round $100 \\times G$.\n\n---\n\n(3) **Entropy**: $H = -\\sum_i p_i \\log_2 p_i$, then round $100 \\times H$.\n\n---\n\n(4) **Leaf majority**: If class 0 has $a$ and class 1 has $b$, predict 0 if $a \\ge b$, else 1. Node count, leaf count, depth: use the numbers given in the problem. 
See the **Explanation for solving the problems** table below."},"visual":""},"mlDecisionTreeProblemSolvingLabel":"Explanation for solving the problems","mlDecisionTreeVisualIntro":"From the root, follow branches by answering yes/no to each question; the leaf gives the prediction.","mlDecisionTreeVisualStep0":"① Root node — first question (e.g. is feature $x_1 \\le 3$?)","mlDecisionTreeVisualStep1":"② Move to left (0 = no) or right (1 = yes) child","mlDecisionTreeVisualStep2":"③ Repeat questions at internal nodes","mlDecisionTreeVisualStep3":"④ Leaf node — output prediction (class or value) with no further split","mlDecisionTreeVisualPathCaption0":"① Root node — ask the first question. Follow branches by yes/no.","mlDecisionTreeVisualPathCaption1":"④ Follow path: Yes(1) → Leaf 0","mlDecisionTreeVisualPathCaption2":"⑤ Follow path: No(0) → Leaf 1","mlDecisionTreeVisualStep0Description":"① Root node — at the first question, branch by yes/no and go down the left or right branch.","mlDecisionTreeVisualLabelRoot":"Root","mlDecisionTreeVisualLabelYes":"Yes(1)","mlDecisionTreeVisualLabelNo":"No(0)","mlDecisionTreeVisualLabelQuestion":"Question","mlDecisionTreeVisualLabelLeaf0":"Leaf 0","mlDecisionTreeVisualLabelLeaf1":"Leaf 1","mlDecisionTreeVisualDiagramAriaLabel":"Decision tree structure: root — question — leaf","mlEnsemble":{"chapter":"Chapter 07","title":"Ensemble and Random Forest: The Wisdom of the Crowd","description":"Ensemble methods combine predictions from multiple models to produce a single, often better prediction. This chapter explains bagging, boosting, stacking, and random forest—where many decision trees vote or average—so beginners can follow the idea of collective intelligence.","sectionTitle":"Ensemble and Random Forest: The Wisdom of the Crowd","whatIs":{"0":"**The core idea of ensemble: many hands make light work** — An ensemble builds a **team of multiple models** and combines their predictions to reach a final answer. 
Like a jury voting on a verdict, using many models instead of one sharply reduces the chance of wrong answers (variance) and makes predictions **more stable**. For classification we use **majority vote**; for regression we use the **average** of predictions.","1":"**Why are many better than one? (Wisdom of the crowd)** — If you ask 100 people to guess a cow's weight, individual guesses may be off, but the **average** of 100 guesses is often surprisingly close to the true weight. When models **independently** judge and we combine results, their random errors tend to cancel out and the **shared signal** remains.","2":"**Three main ensemble methods: Bagging, Boosting, Stacking** — (1) **Bagging**: Each model gets a different random subset of data (like different practice tests); then they vote. (2) **Boosting**: The next model focuses on what the previous one got wrong, learning **sequentially** from mistakes. (3) **Stacking**: A meta-model takes the reports of base models and makes the final decision.","3":"**Random Forest: a forest of diverse trees** — Bagging with **decision trees**: grow hundreds of trees. To keep them diverse, each tree is trained on a bootstrap sample of the data and considers only a **random subset of features** at each split. Some trees rely on \"age\", others on \"income\", maximizing **diversity**.","4":"**Voting and averaging in a formula** — For classification, majority vote means \"the class that most trees chose\". For regression (e.g. house price), average all tree predictions: **$\\hat{y} = \\frac{1}{B}\\sum_{b=1}^B \\hat{y}_b$** where $B$ is the number of trees and $\\hat{y}_b$ is the $b$-th tree's prediction. (e.g. three trees predict 100, 150, 200 → final prediction 150)","5":"**OOB (Out-of-Bag) evaluation** — In bagging/random forest, each tree is trained on a random sample of the data. 
The **left-out samples (Out-of-Bag)** can be used to evaluate those trees that did not see them—like a built-in validation set without holding out separate test data."},"whyImportant":{"0":"**A stable forest that doesn't sway** — A single decision tree can change a lot when data changes slightly. A **forest** of hundreds of trees stays stable; a few wrong trees don't change the overall vote. This leads to strong, reliable performance in practice.","1":"**Natural extension of Ch06 Decision Tree** — The same tree structure (impurity, information gain) is reused. You're not learning new rules—just how to **combine** many trees with voting, so the previous chapter's knowledge is fully used.","2":"**The go-to model in industry and competitions** — Random forest often works very well with little tuning, so it's many practitioners' first choice. It also provides **feature importance**, which helps explain which variables matter most."},"howUsed":{"0":"**General-purpose for business (classification and regression)** — From \"Is this email spam?\" to \"What will tomorrow's stock price be?\", ensembles are used across almost every business problem.","1":"**Finding what matters (feature importance)** — If trees in a loan model rely most on \"income\", that variable is the most important for the decision. This helps filter out unnecessary data.","2":"**Wide real-world use** — Fraud detection, recommendation systems (e.g. 
Netflix, YouTube), equipment failure prediction—wherever accuracy and stability matter."},"problemSolving":{"0":"**Ensemble / Random forest — solving guide** — (1) **Majority vote**: Compare votes for class 0 vs class 1; the **majority** is the final prediction (0 or 1).\n\n---\n\n(2) **Vote count**: The number of votes for the winning class.\n\n---\n\n(3) **Regression mean**: Sum of tree predictions divided by number of trees; round if needed.\n\n---\n\n(4) **OOB**: Number of trees whose bootstrap sample did **not** include this sample.\n\n---\n\n(5) **Formula**: $\\hat{y} = \\frac{1}{B}\\sum_{b=1}^B \\hat{y}_b$ — $B$ is number of trees, $\\hat{y}_b$ the $b$-th prediction. Divide sum by $B$ for the mean. See the **Explanation for solving the problems** table below."},"visual":""},"mlEnsembleVisualIntro":"Combine predictions from multiple models (trees) by voting or averaging to get the final prediction.","mlEnsembleVisualStep0":"① Draw bootstrap samples from training data and train multiple trees","mlEnsembleVisualStep1":"② Each tree predicts independently","mlEnsembleVisualStep2":"③ Classification: majority vote; Regression: average → final prediction","mlEnsembleVisualStep3":"④ The final prediction is determined","mlEnsembleVisualLabelData":"Data","mlEnsembleVisualLabelVote":"Vote/Average","mlEnsembleVisualLabelPrediction":"Prediction","mlEnsembleVisualLabelTree1":"Tree 1","mlEnsembleVisualLabelTree2":"Tree 2","mlEnsembleVisualLabelTree3":"Tree 3","mlEnsembleVisualAriaLabel":"Ensemble flow: Data → Trees → Vote/Average → Prediction","mlKmeansProblemSolvingLabel":"Explanation for solving problems","mlKmeansVisualIntro":"Assign each point to the nearest center, then move centers to the mean of assigned points; repeat.","mlKmeansVisualStep0":"① Data — unlabeled points in feature space","mlKmeansVisualStep1":"② Initialize K centers — place K centroids","mlKmeansVisualStep2":"③ Assign — assign each point to the nearest center (by color)","mlKmeansVisualStep3":"④ 
Update centers — set each center to the mean of its assigned points","mlKmeansVisualStep4":"⑤ Repeat until assignment and centers no longer change","mlKmeansVisualCaption":"K-Means: repeat assign → update to minimize SSE (distortion).","mlKmeansVisualAriaLabel":"K-Means flow: Data → Initial centers → Assign → Update → Converge","mlKmeansVisualMeanLabel":"mean","mlKmeansVisualPointDataLabel":"Point: Data","mlKmeansVisualLineCaption":"Line: from each point to its assigned center (μ)","mlKmeansVisualCenterMoveCaption":"Centers move to cluster mean","mlCrossValidationProblemSolvingLabel":"Explanation for solving the problems","mlCrossValidationVisualIntro":"Split data into train/validation/test; in K-Fold, take turns validating and estimate performance by the mean score.","mlCrossValidationVisualTitle":"① 5-Fold","mlCrossValidationVisualFoldLabel":"Fold{n}","mlCrossValidationVisualTrainLabel":"Train","mlCrossValidationVisualValLabel":"Validation","mlCrossValidationVisualScoreLabel":"Validation score","mlCrossValidationVisualMeanLabel":"Mean μ","mlCrossValidationVisualStep0":"① Full data — sample set for training and validation","mlCrossValidationVisualStep1":"② Train/Val/Test split — train to learn, validate to tune, test for final evaluation","mlCrossValidationVisualStep2":"③ K-Fold — split into K parts, use one part as validation and the rest for training each time","mlCrossValidationVisualStep3":"④ Per-fold validation scores — get $S_1, S_2, \\ldots, S_K$ from each fold","mlCrossValidationVisualStep4":"⑤ Mean $\\bar{S} = \\frac{1}{K}\\sum_{k=1}^K S_k$ — final performance estimate","mlCrossValidationVisualCaption":"Cross validation: practice tests (validation) to estimate skill, final exam (test) to confirm.","mlCrossValidationVisualAriaLabel":"Cross validation flow: data → split → K-Fold → per-fold scores → mean","mlCrossValidationProblemPrompt":"Read the instruction below and enter your answer in the blank (?).","mlCrossValidationProblemPromptDefinition":"If the 
following statement is true enter 1, otherwise 0. {statement}","mlCrossValidationProblemPromptDefinitionChoice":"Choose the option that matches the question. Enter 1, 2, or 3.\n\n{question}","mlCrossValidationProblemPromptHoldoutTrain":"With {n} samples and training ratio {trainRatio}, how many training samples? (integer)","mlCrossValidationProblemPromptHoldoutTest":"With {n} samples and training ratio {trainRatio}, how many test samples? (integer)","mlCrossValidationProblemPromptKfoldSize":"With {n} samples and {K}-Fold, what is the size of one fold (validation set)? (integer quotient)","mlCrossValidationProblemPromptKfoldScoreMean":"K-Fold validation scores (%) are {scores}. Find the mean (integer).","mlCrossValidationProblemPromptScenario":"Choose the most suitable method for the scenario. Enter 1 for Hold-out, 2 for K-Fold, 3 for Stratified K-Fold. {scenario}","mlCrossValidationProblemPromptStratified":"Choose the option that matches the question. Enter 1, 2, 3 or 1/0 for O/X.\n\n{question}","mlCrossValidationStatement_0":"Cross validation estimates performance by splitting data into train/validation/test instead of scoring only on training data.","mlCrossValidationStatement_1":"The validation set is used like a practice test for hyperparameter selection or model comparison.","mlCrossValidationStatement_2":"In K-Fold, data is split into K parts and each part is used once as validation; the mean of validation scores is the final estimate.","mlCrossValidationStatement_3":"The test set is used only once for final performance reporting.","mlCrossValidationStatement_4":"Hold-out splits data once into train and validation (or train and test).","mlCrossValidationStatement_5":"Overfitting is suspected when training score is high but validation/test score is low.","mlCrossValidationStatement_6":"The training set is the data used to learn model weights and parameters.","mlCrossValidationStatement_7":"In K-Fold, one fold size is usually the integer quotient of 
n/K.","mlCrossValidationStatement_10":"It is fine to report final performance on the validation set after training on it.","mlCrossValidationStatement_11":"Hold-out always gives more stable estimates than K-Fold.","mlCrossValidationStatement_12":"The test set can be used multiple times to choose models.","mlCrossValidationStatement_13":"Performance measured only on training data gives an accurate picture of generalization.","mlCrossValidationStatement_14":"In K-Fold, a larger K means fewer validation runs.","mlCrossValidationQuestionChoice_0":"The main purpose of cross validation is? ① Estimate generalization ② Speed up training ③ Data augmentation","mlCrossValidationQuestionChoice_1":"When data is limited, which is more advantageous? ① Hold-out ② K-Fold ③ Stratified only","mlCrossValidationQuestionChoice_2":"What corresponds to a practice test? ① Train ② Validation ③ Test","mlCrossValidationQuestionChoice_3":"Which keeps class proportions in each fold? ① Hold-out ② Plain K-Fold ③ Stratified K-Fold","mlCrossValidationQuestionChoice_4":"What corresponds to the final exam? ① Train ② Validation ③ Test","mlCrossValidationQuestionChoice_5":"Which set is used to choose hyperparameters? ① Train ② Validation ③ Test","mlCrossValidationQuestionChoice_6":"Which validates by using different splits multiple times? ① Hold-out ② K-Fold ③ Test only","mlCrossValidationQuestionChoice_7":"When might we suspect overfitting? 
① High train and high validation ② High train and low validation ③ Low train and high validation","mlCrossValidationScenario_0":"We have 10,000 samples and want to evaluate once with a single split.","mlCrossValidationScenario_1":"We have only 500 samples and want a stable validation estimate by splitting multiple ways.","mlCrossValidationScenario_2":"We split once 80% train, 20% test and use the test set only once at the end.","mlCrossValidationScenario_3":"Classification with 90:10 class imbalance; we want each fold to preserve that ratio.","mlCrossValidationScenario_4":"We want to run validation 5 times and report the average accuracy.","mlCrossValidationScenario_5":"We split once 70:30 and use that split.","mlCrossValidationScenario_6":"We run K validation runs to reduce the variance of the estimate.","mlCrossValidationScenario_7":"Binary classification; we want to keep the positive rate in each fold.","mlCrossValidationStratified_0":"What is an advantage of Stratified K-Fold? ① Preserve class ratio ② Faster ③ Less memory","mlCrossValidationStratified_1":"For imbalanced classes in classification, what is recommended? ① Hold-out only ② Stratified K-Fold ③ Skip validation","mlCrossValidationStratified_2":"Stratified is mainly used for? 
① Regression only ② Classification (preserve class ratio) ③ Clustering","mlEvaluationProblemPrompt":"Read the instruction below and enter your answer in the blank (?).","mlEvaluationProblemSolvingLabel":"Explanation for solving the problems","mlEvaluationVisualIntro":"Fill the 2×2 confusion matrix with actual (rows) and predicted (columns), then compute accuracy, precision, recall, and F1.","mlEvaluationVisualStep0":"① Actual vs predicted — rows: actual pos/neg, columns: predicted pos/neg","mlEvaluationVisualStep1":"② Confusion matrix — fill the four cells TP, TN, FP, FN","mlEvaluationVisualStep2":"③ Accuracy — (TP+TN)/total, fraction correct","mlEvaluationVisualStep3":"④ Precision & recall — precision: TP/(TP+FP), recall: TP/(TP+FN)","mlEvaluationVisualStep4":"⑤ F1 — harmonic mean of precision and recall","mlEvaluationVisualCaption":"Read the model's report card via the confusion matrix and choose metrics that match your goal.","mlEvaluationVisualAriaLabel":"Classification evaluation: confusion matrix → accuracy, precision, recall, F1","mlEvaluationVisualMatrixTitle":"Confusion Matrix (2×2)","mlEvaluationVisualStepLineTP":"Actual positive · Predicted positive → TP","mlEvaluationVisualStepLineFN":"Actual positive · Predicted negative → FN","mlEvaluationVisualStepLineFP":"Actual negative · Predicted positive → FP","mlEvaluationVisualStepLineTN":"Actual negative · Predicted negative → TN","mlEvaluationVisualPredPos":"Predicted positive","mlEvaluationVisualPredNeg":"Predicted negative","mlEvaluationVisualActualPos":"Actual positive","mlEvaluationVisualActualNeg":"Actual negative","mlEvaluationVisualBadgeTP":"True positive ✓","mlEvaluationVisualBadgeFN":"False negative (actual pos → predicted neg)","mlEvaluationVisualBadgeFP":"False positive (actual neg → predicted pos)","mlEvaluationVisualBadgeTN":"True negative ✓","mlEvaluationVisualBadgeFixed":"After distinguishing TP, FN, FP, TN, compute accuracy, precision, recall, and 
F1.","mlEvaluationProblemPromptDefinition":"If the following statement is true enter 1, otherwise 0.\n\n{statement}","mlEvaluationProblemPromptDefinitionChoice":"Choose the option that matches the question. Enter 1, 2, or 3.\n\n{question}","mlEvaluationProblemPromptScenario":"Choose the most suitable option for the scenario. Enter 1, 2, or 3.\n\n{scenario}","mlEvaluationProblemPromptConfusionCount":"With TP={tp}, TN={tn}, FP={fp}, FN={fn} in the confusion matrix, what is the value (integer) of {cell}?","mlEvaluationProblemPromptTotalCount":"With TP={tp}, TN={tn}, FP={fp}, FN={fn}, what is the total count n (integer)?","mlEvaluationProblemPromptAccuracy":"With TP={tp}, TN={tn}, FP={fp}, FN={fn}, what is accuracy (%) (integer)?","mlEvaluationProblemPromptPrecision":"With TP={tp}, TN={tn}, FP={fp}, FN={fn}, what is precision (%) (integer)?","mlEvaluationProblemPromptRecall":"With TP={tp}, TN={tn}, FP={fp}, FN={fn}, what is recall (%) (integer)?","mlEvaluationProblemPromptF1":"With TP={tp}, TN={tn}, FP={fp}, FN={fn}, what is F1 score (%) (integer)?","mlEvaluationStatement_0":"The confusion matrix is a 2×2 table of actual class (rows) and predicted class (columns).","mlEvaluationStatement_1":"Accuracy is (TP+TN) divided by total count.","mlEvaluationStatement_2":"The denominator of precision is TP+FP.","mlEvaluationStatement_3":"The denominator of recall is TP+FN.","mlEvaluationStatement_4":"F1 is the harmonic mean of precision and recall.","mlEvaluationStatement_5":"TP is the count of actual positive and predicted positive.","mlEvaluationStatement_6":"FN is actual positive but predicted negative (miss).","mlEvaluationStatement_7":"With imbalanced data, accuracy alone can be misleading.","mlEvaluationStatement_10":"Precision and recall are always equal.","mlEvaluationStatement_11":"High accuracy always means the model is suitable for production.","mlEvaluationStatement_12":"FP is actual positive but predicted negative.","mlEvaluationStatement_13":"The denominator of 
recall is TP+FP.","mlEvaluationStatement_14":"TN is actual positive and predicted positive.","mlEvaluationQuestionChoice_0":"The numerator of accuracy is? ① TP+TN ② TP+FP ③ TP+FN","mlEvaluationQuestionChoice_1":"The denominator of precision is? ① TP+FN ② TP+FP ③ TN+FN","mlEvaluationQuestionChoice_2":"When is recall important? ① Allowing spam as normal ② When we must not miss disease ③ Minimizing false alarms","mlEvaluationQuestionChoice_3":"F1 is the harmonic mean of? ① Accuracy and precision ② Precision and recall ③ Recall and accuracy","mlEvaluationQuestionChoice_4":"TP means? ① Actual pos, predicted pos ② Actual neg, predicted pos ③ Actual pos, predicted neg","mlEvaluationQuestionChoice_5":"False positive is? ① FP ② FN ③ TN","mlEvaluationQuestionChoice_6":"False negative is? ① FP ② FN ③ Precision","mlEvaluationQuestionChoice_7":"Total count n is? ① TP+TN ② TP+TN+FP+FN ③ TP+FP+FN","mlEvaluationScenario_0":"We must not miss spam (some false positives acceptable). Important metric? ① Recall ② Precision ③ Accuracy","mlEvaluationScenario_1":"In medical diagnosis, we must not say 'no disease' when there is. Important metric? ① Accuracy ② Recall ③ Precision","mlEvaluationScenario_2":"In ad click prediction, we want to raise 'fraction of predicted clicks that are real'. Important metric? ① Recall ② Precision ③ F1","mlEvaluationScenario_3":"In fraud detection we must not miss fraud. Important metric? ① Precision ② Recall ③ Accuracy","mlEvaluationScenario_4":"To balance precision and recall we use? ① Accuracy ② F1 ③ TP","mlEvaluationScenario_5":"When classes are 99:1 imbalanced, accuracy alone? ① Is reliable ② Can be misleading ③ Equals F1","mlEvaluationScenario_6":"The metric closest to 'fraction of relevant docs in top 10' is? ① Recall ② Precision ③ FN","mlEvaluationScenario_7":"The metric for 'fraction of actual positives that the model got right' is? 
① Precision ② Recall ③ Accuracy","mlKmeans":{"chapter":"Chapter 08","title":"K-Means Clustering: Grouping Without Labels","description":"K-Means is a classic **unsupervised learning** algorithm that groups data into K clusters using **distance**—no labels. You will see how the 'unsupervised' idea from Ch01 works in practice: concept → intuition → math → application. It reuses the distance formula from Ch02 (KNN) and shows how repeating 'assign to nearest center' and 'update centers' yields clear clusters.","sectionTitle":"K-Means Clustering: Grouping Without Labels","whatIs":{"0":"**What is K-Means?** — With no labels $y$, only data $\\mathbf{x}_1, \\mathbf{x}_2, \\ldots$, K-Means partitions points into **K groups** by **nearest centroid**. Distance is **Euclidean** $d(\\mathbf{x}, \\boldsymbol{\\mu}) = \\sqrt{\\sum_j (x_j - \\mu_j)^2}$ (as in Ch02). Each group has one **centroid** $\\boldsymbol{\\mu}_k$. The algorithm alternates: assign each point to the nearest center → set each center to the mean of its assigned points, until convergence.","1":"**K is the number of clusters** — The user chooses **K** (e.g. K=2 → two groups). There are no 'correct' labels, only a partition. In practice, K is chosen by domain knowledge, the elbow method, or silhouette scores.","2":"**Objective: minimize SSE (distortion)** — K-Means minimizes $J = \\sum_{k=1}^K \\sum_{i \\in C_k} \\|\\mathbf{x}_i - \\boldsymbol{\\mu}_k\\|^2$. The update $\\boldsymbol{\\mu}_k = \\frac{1}{|C_k|}\\sum_{i \\in C_k} \\mathbf{x}_i$ (mean of assigned points) reduces each cluster's SSE.","3":"**If the formulas feel heavy** — The distance formula is just 'length between a point and a center.' SSE $J$ is a single number for 'how tightly points sit around their center'; the algorithm moves centers to make $J$ smaller. The centroid update is literally 'average of the coordinates of points in that cluster.' 
The **Formula guide** below spells out each symbol step by step."},"whyImportant":{"0":"**Ch01 unsupervised learning in action** — K-Means is the go-to when you have no labels and want structure (e.g. customer segmentation, clustering documents or images, preprocessing for anomaly detection).","1":"**Customer segmentation** — With only purchase history and no segment labels, K-Means groups similar customers; people then attach meaning (e.g. VIP, churn risk) to each cluster and use it for downstream tasks (Ch09, Ch12).","2":"**Simple and interpretable** — Assign (nearest center) and update (mean) are easy to implement and visualize in 2D."},"howUsed":{"0":"**Clustering** — Customer segmentation, topic/document grouping, image color compression, gene expression groups.","1":"**Preprocessing** — Use cluster index as a new feature for supervised models, or keep only centroids to reduce data size.","2":"**Choosing K** — The user sets K; compare SSE or silhouette across K to pick a value (e.g. elbow)."},"problemSolving":{"0":"**Summary**\n\n(1) **Input**: Unlabeled points and cluster count $K$.\n\n(2) **Initialize**: Place $K$ centroids at random or by heuristic.\n\n(3) **Assign**: Each point goes to the nearest center's cluster.\n\n(4) **Update**: Set each center to the mean of its assigned points.\n\n(5) **Repeat**: Steps 3–4 until assignment and centers no longer change.\n\n**Objective**: Minimize SSE (distortion) $J = \\sum_{k}\\sum_{i \\in C_k} \\|\\mathbf{x}_i - \\boldsymbol{\\mu}_k\\|^2$.\n\n**Centroid update**: $\\boldsymbol{\\mu}_k = \\frac{1}{|C_k|}\\sum_{i \\in C_k} \\mathbf{x}_i$\n\nSee the table and examples below.","1":"**Terminology**\n\n| Item | Description |\n| :--- | :--- |\n| **Distance squared** | For two points $(x_1,y_1)$, $(x_2,y_2)$: $(x_2-x_1)^2+(y_2-y_1)^2$. No need for the square root when comparing. 
|\n| **Assign** | For a point and $K$ centers, compute distance (or distance²) to each; the point's cluster is the **index (1-based) of the nearest center**, and a tie goes to the smaller index. |\n| **Center update** | New center = (mean of $x$, mean of $y$) of points in that cluster; round if needed. |\n| **SSE** | In one cluster: $J = \\sum_{i \\in C_k} \\lVert\\mathbf{x}_i - \\boldsymbol{\\mu}_k\\rVert^2$ (sum of squared distances to center). |\n\n---\n\n**Example (assign)**\n\nCenters $\\mu_1=(0,0)$, $\\mu_2=(4,0)$; point $(2,0)$. Distance²: $d_1^2=4$, $d_2^2=4$; tie → cluster 1. **Answer 1**\n\n---\n\n**Example (center update)**\n\nPoints $(1,2)$, $(3,4)$ in cluster 1 → new center $\\bar{x}=2$, $\\bar{y}=3$. **(2, 3)**","2":"$20"},"visual":""},"mlCrossValidation":{"chapter":"Chapter 09","title":"Cross Validation: Practice Tests and the Real Exam","description":"Cross validation is essential so that models do not become \"frogs in a well\"—only good at the exercises they memorized. Just as students use **practice tests** to check their real level and the **final exam** to confirm it, we do not score machine learning models only on **training data**; we evaluate them on **validation** and **test** data they have not seen. This chapter covers **cross validation** (Hold-out, K-Fold, etc.) and how to make performance estimates reliable.","sectionTitle":"Cross Validation: Practice Tests and the Real Exam","whatIs":{"0":"**What is cross validation? \"Don’t score with the same problems they practiced\"** — If a math exam contained only problems from the workbook, we could not tell whether students understood the ideas or had **overfit** by memorizing answers. The same holds for ML: testing on training data always looks good. So we split data into **train**, **validation**, and **test**, and evaluate the model strictly and fairly on data it has never seen. 
That process is cross validation.","1":"**Three roles when splitting data** — The ideal split and role of each part are as follows.\n\n| Data type | Metaphor | Role and use | Typical ratio |\n| :--- | :--- | :--- | :--- |\n| **Training (Train)** | Textbook / practice set | Main data used to learn patterns and update weights. | ~70–80% |\n| **Validation** | Practice exam | Used mid-learning to check performance and tune hyperparameters. | ~10–15% |\n| **Test** | Final exam | Used **only once** after all learning to report final performance. | ~10–15% |","2":"**How to split? Hold-out and K-Fold** — There are two main approaches. **Hold-out** is like cutting a pizza once: you split the data once into train and test. It is simple and fast, but if by chance the \"easy\" part ends up in the test set, the estimate can be overly optimistic. **K-Fold cross validation** divides data into K segments and uses each in turn as the \"practice exam\" (validation) and the rest for training, so every sample is validated once and the estimate is more stable and objective.","3":"**K-Fold final score in a formula** — After K-Fold you have K \"exam\" scores. The model’s final performance is the average of these K scores.\n\n* **Mean score formula:** $\\bar{S} = \\frac{1}{K}\\sum_{k=1}^K S_k$\n\n* **Symbols:** $K$ = number of folds (number of validation runs), $S_k$ = score when the $k$-th fold was used for validation (e.g. accuracy or MSE). $\\sum_{k=1}^K S_k$ means $S_1 + S_2 + \\cdots + S_K$, so $\\bar{S}$ is the **mean of the K validation scores** and is used as the final performance estimate.\n\n* **Numeric example:** With 5-Fold, if the five scores are 80, 85, 90, 80, 85, then $\\bar{S} = (80+85+90+80+85)/5 = 84$."},"whyImportant":{"0":"**Escaping the \"frog in a well\" (detecting overfitting)** — If the model scores 99 on training data but 50 on unseen validation data, it is almost certainly **overfitting** (memorizing rather than understanding). 
Cross validation acts as a filter to catch such models before they fail in production.","1":"**Proving real-world performance (generalization)** — Companies adopt AI to predict the future, not to replay the past. Models validated with K-Fold and a held-out test set are more likely to perform well on truly new data.","2":"**Finding the best setup (hyperparameters and model choice)** — When choosing tree depth, K in K-NN, learning rate, etc., we run multiple settings on the validation set and pick the best. Because the test set is kept separate, we can compare models fairly."},"howUsed":{"0":"**Data scientist routine (production pipeline)** — In practice, the first step is to set aside about 10% of the data as the **test set** and lock it away. The rest is used for training and K-Fold validation until the best model is ready; then the test set is used once to report: \"Our model’s final accuracy is 92%.\"","1":"**Fair algorithm comparison** — When asking \"Is logistic regression or random forest better for our churn prediction?\", the same K-Fold setup is applied to both; the algorithm with the higher mean validation score ($\\bar{S}$) is chosen for deployment."},"problemSolving":{"0":"**Summary** — Cross validation starts from the premise that we must not measure performance only on the data used for training. Just as students take practice tests before the real exam, in machine learning we cannot tell if the model has \"memorized the exercises\" if we score only on **training data**. So we split data into **train**, **validation**, and **test**. The **training** set is used for the model to learn patterns; the **validation** set is used to check performance during learning or to choose hyperparameters; the **test** set is used **only once** after all learning to report final performance before deployment. The main split strategies are **Hold-out** and **K-Fold**. Hold-out splits the data once into train and test (or validation). 
K-Fold divides data into K segments, uses one segment at a time for validation and the rest for training. With K-Fold every sample is used for validation once, so the performance estimate is more stable than with a single split.","1":"$21","2":"**By problem type** — ① **Definition (T/F)**: 1 if true, 0 if false. ② **Hold-out train count**: $n \\times (\\text{ratio}/100)$ (ratio 50% or 80%). ③ **Hold-out test count**: $n - \\text{train size}$. ④ **K-Fold fold size**: $\\lfloor n/K \\rfloor$. ⑤ **K-Fold mean**: $\\bar{S} = \\frac{1}{K}\\sum_k S_k$; scores in % (integer), answer = sum ÷ K. ⑥ **Multiple choice (1–3)**: Hold-out = single split, K-Fold = multiple validation runs, Stratified = preserve class ratio."},"visual":""},"mlEvaluation":{"chapter":"Chapter 10","title":"Classification Metrics: The Model's Detailed Report Card","description":"Learn the **'detailed report card'** that a classification AI model receives after its test. Beyond \"how many did you get right?\" (accuracy), we look at **confusion matrix** concepts that ask \"which questions did you get wrong, and how?\" In business settings where *how* the model is wrong can be critical—spam filters, cancer diagnosis AI—we explain how **precision, recall, and F1** prove the model's real capability, with intuitive analogies.","sectionTitle":"Classification metrics: confusion matrix and the model's report card","whatIs":{"0":"**What is the confusion matrix? The AI's detailed report card** — Just as knowing only \"how many correct\" on an exam doesn't tell you whether a student is good at math or English, we need more for a classifier. The **confusion matrix** is a 2×2 table that compares the model's **predictions (columns)** with **actual answers (rows)**. 
By reading the four cells, you can see what the model gets right and where it gets confused and stumbles.","1":"**The four cells: TP, TN, FP, FN** — Think of the famous \"boy who cried wolf.\" Here 'positive' means the boy cries wolf; 'negative' means peace.\n* **TP (True Positive):** Wolf really came (1), boy cried wolf (1). Best outcome—village saved.\n* **TN (True Negative):** No wolf (0), boy stayed quiet (0). Peace.\n* **FP (False Positive):** No wolf (0), boy cried wolf (1). Villagers run out with pitchforks for nothing (false alarm).\n* **FN (False Negative):** Wolf came (1), boy was asleep (0). Sheep get eaten—worst outcome (miss).\n* Total count $n = \\mathrm{TP} + \\mathrm{TN} + \\mathrm{FP} + \\mathrm{FN}$.","2":"**Accuracy's dangerous trap** — It is the fraction of correct answers: $\\text{Accuracy} = \\frac{\\mathrm{TP}+\\mathrm{TN}}{n}$. Intuitive but treacherous. Suppose 99 out of 100 days are peaceful and the wolf comes only once. A robot that closes its eyes and always says \"No wolf!\" still gets 99% accuracy. When positive cases are rare (imbalanced data), you must not trust accuracy alone.","3":"**Precision and recall: two rabbits to chase** —\n* **Precision (caution):** \"When I cried wolf, how often was it really the wolf?\" The share of **predicted positives** that are **truly positive**. $\\text{Precision} = \\frac{\\mathrm{TP}}{\\mathrm{TP}+\\mathrm{FP}}$. It goes up when you avoid false alarms (FP).\n* **Recall (sensitivity):** \"Of all the times the wolf actually came, how often did I notice and warn?\" The share of **actual positives** that the model **got right**. $\\text{Recall} = \\frac{\\mathrm{TP}}{\\mathrm{TP}+\\mathrm{FN}}$. It goes up when you miss fewer true wolves (FN).","4":"**F1 score: the golden balance of precision and recall** — Precision and recall are like a seesaw: pushing one up often pushes the other down. 
**F1** summarizes both in one number using the **harmonic mean**: $\\text{F1} = \\frac{2 \\cdot \\mathrm{TP}}{2\\cdot\\mathrm{TP}+\\mathrm{FP}+\\mathrm{FN}}$. If either precision or recall is poor, F1 tanks. Use F1 when you want a model with good balance.","5":"**AUC (Area Under the ROC Curve): the model's ranker** — When the model outputs a probability (e.g. \"90% chance of wolf\") rather than a bare yes/no, **AUC** measures how well **actual positives** get higher scores than **actual negatives** (discriminative power), on a 0–1 scale. 1 = perfect ranking; 0.5 = coin flip. Very useful to compare models before choosing a threshold."},"whyImportant":{"0":"**Don't fall for 99% accuracy** — Imagine a credit-card fraud detector: 1 fraudulent transaction in 100,000. A model that does nothing and always says \"all normal\" still has 99.999% accuracy—but 0% recall (catches no fraud). You must open the **confusion matrix** and inspect **precision** and **recall** to see if the model is doing its job or gaming the numbers.","1":"**In practice, it's a fierce trade-off: which mistake can you live with?** — The metric you bet on depends on the business.\n* **Recall (don't miss) is life:** Cancer screening. Better to have healthy people get extra tests (FP) than to miss a real case (FN) and delay treatment.\n* **Precision (fewer false alarms) is life:** Spam filter. Missing a few spam emails (FN) is fine—delete and move on. Misclassifying the boss's email as spam (FP) can be career-threatening."},"howUsed":{"0":"**Final pass/fail for AI services (binary classification)** — COVID-19 positive/negative, YouTube harmful-video block/allow, bank loan approve/reject: before deployment, real-world projects draw the confusion matrix and review precision, recall, and F1.","1":"**Tuning alarm sensitivity (threshold tuning)** — Models usually output a probability. \"At what % do we sound the alarm?\" Adjusting this threshold tailors the model to the business: e.g. 
lower threshold for maximum recall (security-critical), higher for maximum precision (when too many false alarms annoy users)."},"problemSolving":{"0":"Don't judge a classification model by correct count alone. Fill a **confusion matrix** (actual rows, predicted columns) with TP, TN, FP, FN. **Accuracy** = (TP+TN)/n. **Precision** = TP/(TP+FP). **Recall** = TP/(TP+FN). For imbalanced data, emphasize precision (fewer false alarms) or recall (fewer misses) by goal; **F1** for balance. In practice, combine these for spam, diagnosis, fraud, and threshold choice.","1":"**Exact meaning of each (in words)** — **TP**: count where actual positive and predicted positive. **TN**: actual negative, predicted negative. **FP**: actual negative, predicted positive (false positive). **FN**: actual positive, predicted negative (miss). **Accuracy**: fraction of all samples that are correct. **Precision**: of predicted positives, fraction that are truly positive. **Recall**: of actual positives, fraction the model got. **F1**: harmonic mean of precision and recall. 
**AUC**: how well positives are ranked above negatives (0–1), independent of threshold.","2":"$22"},"visual":""},"mlRegularization":{"chapter":"Chapter 11","title":"Regularization: Beyond Rote Memorization"},"mlRecommendation":{"chapter":"Chapter 12","title":"Collaborative Filtering: Recommendation Basics"}},"mathChapters":{"mathCumulativeVisualTitle":"Basic math concept flow","mathCumulativeVisualLabel":"Basic math chapter concept visual","sectionLabels":{"whatIs":"What the concept is","whyImportant":"Why it matters","howUsed":"How it is used","problemSolving":"Explanation for solving the problems"},"mathIntro":{"chapter":"Chapter 00","title":"Basic Math and AI: Learning the Language of AI","description":"Why math is needed to understand deep learning and machine learning, what math tools are used—we draw that map together.","sectionTitle":"Why do we need math to understand deep learning and machine learning?","visualIntro":"","visualInputLabel":"Input","visualInputTypes":"Image, text, sound","visualMathLabel":"Basic math","visualMathTopics":"Functions · vectors · matrices","whatIs":{"0":"**Understanding AI requires math as a lens** — Deep learning and machine learning turn the images, text, and sound we give them into **numbers**. Those numbers pass through **functions** and repeated **multiplication and addition** to find the answer. Because this whole process is written in math, knowing math lets you read the **inner workings** of AI clearly.","1":"**What math tools will we use?** — We will learn **functions** (rules that map input to output), **vectors and matrices** (bundling lots of data for batch computation), **differentiation** (so the model can learn and move toward the right answer), and **probability and distributions** (to measure how likely an outcome is). These tools together build the intelligence of AI.","2":"**In short** — AI runs on a solid foundation of numbers and functions. 
To interpret why AI produced a given result and to build better models, you need basic strength in **functions**, **limits**, **differentiation**, and **probability**. This course is the journey of building that foundation step by step."},"whyImportant":{"0":"**To understand why AI decides as it does** — Every decision AI makes is ultimately the result of **numbers and functions**. We learn functions and differentiation so we can follow the computation and logically understand **why that answer was produced**.","1":"**Where math works in the AI model** — Each **layer** of the model is a set of **functions** that multiply by weights and add. The process of the model learning and reducing error uses the concept of **gradient** (differentiation). Probability becomes the measure of how confident the AI is in its prediction.","2":"**The roadmap we will follow (Ch01–Ch12)** — This course proceeds in order: **Functions (Ch01–03)** (flow of data), **Limits and continuity (Ch04–05)** (foundations of change), **Differentiation (Ch06–08)** (heart of learning), **Integral (Ch09)** (accumulation and basis of probability), and **Probability and distributions (Ch10–12)** (uncertainty)."},"howUsed":{"0":"**The link between reality and math** — An AI model has the structure **input → turn into numbers → repeat functions → output**. **Functions** are the building blocks, **differentiation** is the chisel that shapes them to get smarter, and **probability** is the tool that checks the stability of the finished building. 
Once you master this basic math, the complex formulas of deep learning start to read like meaningful sentences."},"problemSolving":{"0":"| Category | Role in AI | Key math concepts |\n| --- | --- | --- |\n| **Input & output** | Basic framework for feeding data and getting answers | Functions, exponents, logarithms |\n| **Learning (Training)** | Process of reducing error to approach the correct answer | Limits, derivatives, chain rule |\n| **Prediction & decision** | Choosing the best among uncertain outcomes | Probability, statistics, normal distribution |"}},"mathFunctions":{"chapter":"Chapter 01","title":"Functions: The Basic Unit of AI That Connects Input and Output","description":"A function is a rule that assigns one output to each input. The way AI turns input into output is directly connected to this function concept.","sectionTitle":"What is a function?","visualIntro":"One input x gives exactly one output y. The diagram below shows the flow x → f → y.","visualCaption":"Example: x = 3 gives 7 for f(x) = 2x + 1","whatIs":{"0":"A **function** is a strict **mapping** between two sets. Every element of the **domain** (the set of inputs) must be connected to **exactly one** element of the **codomain** (the set of outputs). Just as a vending machine is broken if pressing a button gives no drink or two drinks at once, a function must have exactly one output for each input.","1":"We write **y = f(x)**. Here **x** is the **independent variable (cause)** and **y** is the **dependent variable (result)**. From an AI perspective, **x** is the **data** we provide (pixels, text, sensor values), and **y** is the **prediction** the AI computes. The function **f** acts as a **transformer** that turns this data into answers.","2":"An **AI model** itself is a huge **composite function**. Input data is transformed by the first function (layer), and that result is fed into the next function (layer); this repeats dozens of times. 
Just as we write $y = f(g(h(x)))$ in math, deep learning stacks many functions in layers to read complex patterns from data."},"whyImportant":{"0":"Because we can **model the real world**. A vague relation like \"more study leads to better grades\" can be expressed as a **linear function** $y = ax + b$, so we can compute expected grades ($y$) from study time ($x$). AI approximates far more complex nonlinear relations (e.g., images to object names) as functions to solve problems.","1":"Functions are the **object of optimization**. The goal of AI training is to minimize the error between the correct answer and the prediction. That error is computed by a **loss function**, and we use differentiation to find its minimum. Without functions, there would be no mathematical basis for training AI.","2":"They are the language of **change**. We need to know how much the output changes when the input changes a little (the slope) so that AI can move step by step toward the correct answer. Functions make the **cause–effect** relationship between input and output explicit in math, so we can analyze **why** the AI made a given decision."},"howUsed":{"0":"Every **neuron** in AI is a small **function**. It takes input signals ($x$), multiplies them by weights ($w$) and adds ($wx+b$), then passes the result through an **activation function** to the next neuron. Functions like **ReLU** and **Sigmoid** decide whether to pass the signal on; many such small functions together make complex decisions like the human brain.","1":"**Data transformation**: A photo is just a pile of numbers ($x$) to the computer. 
AI passes them through functions to shrink or expand dimensions and keep only key features ($y$) like \"ear shape\" or \"eye shape.\" That's mapping high-dimensional vectors to a lower-dimensional space.","2":"**Probability**: The **softmax** function at the last step of classification turns raw scores into \"probabilities that sum to 1.\" So the AI can say \"this image is 90% a dog.\" Functions turn raw data into information we can interpret."},"problemSolving":{"0":"| Function | Example (input → output) |\n| --- | --- |\n| $f(x)=x+1$ | 3 → 4, 10 → 11 |\n| $g(x)=2x$ | 3 → 6, 10 → 20 |\n| $h(x)=x^2$ | 3 → 9, $-2$ → 4 |","1":"In the visual below, **f(x) = 2x + 1** gives 7 for x = 3 and 21 for x = 10. Fill in the blank in the problem."}},"mathVideoExponential":{"chapter":"Chapter 02","title":"Exponents and Exponential Functions: The Math of Growth and Activation","description":"Exponentiation is repeated multiplication of the same base; an exponential function fixes the base and uses the exponent as the variable. Used in activation and loss design in deep learning.","sectionTitle":"What are exponent and exponential function?","visualIntro":"Fix a base $a$; for each exponent $x$ the value $a^x$ is determined. Below are examples for $2^x$.","visualCaption":"Example: $2^0=1$, $2^1=2$, $2^2=4$, $2^3=8$","whatIs":{"0":"An **exponent** is how many times a number (the **base**) is multiplied by itself. Like the fact that folding a piece of paper 42 times would reach the moon, repeated **multiplication** (not addition) makes values grow **explosively (exponential growth)**.","1":"An **exponential function** puts that repeated power in a variable: $y = a^x$. In polynomials the variable is in the base ($x^2$); in exponentials the variable is in the **exponent**. That means growth proportional to current size. If $a>1$, the value shoots up as $x$ increases (**exponential growth**); if $0<a<1$, it shrinks toward 0 (**exponential decay**)."},"whyImportant":{"0":"The output is **always positive**: for any real $x$, $a^x>0$. 
AI cannot say \"the probability is -50%,\" so exponentials are essential when we need outputs to be **positive** (e.g. probabilities or positive scores).","1":"They **amplify small differences**. Inputs 1 and 2 differ by 1, but $10^1=10$ and $10^2=100$ differ by 90. AI uses this to **sharply separate** similar data and **classify** with confidence.","2":"**Efficient differentiation**: Backprop is a long chain of derivatives. The exponential $e^x$ keeps the same shape when differentiated (or stays in a simple form), which is crucial for fast, stable training."},"howUsed":{"0":"Used in the **softmax** function. When AI chooses one out of 1000 images, it applies $e^x$ to each score. Slightly higher scores get much larger values and lower ones shrink toward 0, so the model can say \"this is the answer with 99% confidence.\"","1":"The **sigmoid** function $y = \\frac{1}{1+e^{-x}}$ squeezes the input into (0, 1). The output never exceeds 1 or goes below 0, so the neuron acts like an on/off switch."},"problemSolving":{"0":"| Expression | Value |\n| --- | --- |\n| $2^0$ | 1 |\n| $2^1$ | 2 |\n| $2^2$ | 4 |\n| $2^3$ | 8 |\n| $2^4$ | 16 |\n| $3^2$ | 9 |\n| $3^3$ | 27 |","1":"In the visual below, $y = 2^x$ gives $1$ for $x=0$, $2$ for $x=1$, $4$ for $x=2$, $8$ for $x=3$. Use it to see how base and exponent relate.","2":"**Problem types and how to solve them**\n\n| Type | Description | How to get the answer |\n| --- | --- | --- |\n| **Find value** | $a^x = ?$ | Multiply base $a$ by itself $x$ times. E.g. $2^3 = 8$. |\n| **Find exponent** | $a^? = \\text{value}$ | \"How many times do we multiply $a$ to get this value?\" That count is the answer. E.g. $2^? = 8 \\Rightarrow 3$. |\n| **Compare** | Which is larger: 1) $a^{m}$, 2) $b^{n}$? | Compute each, then compare. If (1) is larger enter **1**, if (2) enter **2**. |\n| **Product, same base** | $a^p \\times a^q = a^?$ | **Add** exponents: $? = p + q$. 
(Rule: $a^p \\cdot a^q = a^{p+q}$) |\n| **Quotient, same base** | $a^p \\div a^q = a^?$ ($p \\ge q$) | **Subtract** exponents: $? = p - q$. (Rule: $a^p / a^q = a^{p-q}$) |\n| **Power of power** | $(a^p)^q = ?$ | **Multiply** exponents: $? = a^{p \\times q}$. (Rule: $(a^p)^q = a^{pq}$) |"}},"mathVideoLog":{"chapter":"Chapter 03","title":"Logarithm: From Multiplication to Addition, the Language of Loss Design","description":"A logarithm answers 'how many times we multiply the base to get this number?' It is the inverse of exponentiation and is used with exponentials in loss and probability in deep learning.","sectionTitle":"What is the logarithm?","visualIntro":"Logarithm is the inverse of exponent. $y = \\log_2 x$ means $2^y = x$. Below are the graphs of $y = \\log_2 x$ and its inverse $y = 2^x$.","visualCaption":"Example: $\\log_2 1 = 0$, $\\log_2 2 = 1$, $\\log_2 4 = 2$, $\\log_2 8 = 3$ (when $2^y = x$, $y$ is $\\log_2 x$)","visualLegend":"Purple: $y=\\log_2 x$, Teal: $y=2^x$","whatIs":{"definition":"The **logarithm** is like \"running exponentiation backward.\" In $2^3 = 8$, when you see the result 8 and ask \"**how many times** did we multiply 2 to get 8?\", that count (3) is the logarithm: $\\log_2 8 = 3$. Here 2 is the **base** and 8 is the **argument**.","example":"Think of it as **counting digits**. $100 = 10^2$ so $\\log_{10} 100 = 2$; $1000 = 10^3$ so $\\log_{10} 1000 = 3$. When the number grows 10×, the log value only goes up by 1. So log acts as a **filter** that turns explosively large numbers into much gentler ones. **Basic properties**: $\\log_a 1 = 0$ (base to the 0th power is 1), $\\log_a a = 1$ (base to the 1st power is itself).","logSumProduct":"**The magic of log** is that it turns multiplication into addition: $\\log_a(b \\times c) = \\log_a b + \\log_a c$. 
For computers, multiplication is costlier than addition and can overflow or underflow; taking the log turns that multiplication into a safer, simpler addition.","whyInAI":"The **argument condition ($x>0$)** matters: log of 0 or a negative number is undefined. So in AI code we often add a tiny constant ($\\epsilon$, epsilon) to avoid $\\log(0)$ errors. The **natural log** ($\\ln$, base $e$) keeps differentiation tidy and is the standard in deep learning."},"whyImportant":{"0":"**Avoiding underflow** is essential. If AI multiplies probability $0.1$ a hundred times ($0.1^{100}$), the computer may treat it as zero. Taking the log gives $\\log(0.1^{100}) = 100 \\times \\log(0.1) = -100$—a **meaningful number** the computer can still handle.","1":"It is the **ruler for information (entropy)**. The rarer an event, the larger (in absolute value) its log. A rare event (e.g. \"sun rises in the west\") carries high information; an obvious one (\"morning comes\") carries almost none. AI uses this log-based measure to see **how much surprising information** was learned.","2":"**It penalizes mistakes harshly**. For $y=\\ln x$ with $0② “Undo” of $3$ is $3x$.
③ $3\\cdot 2 - 3\\cdot 0 = 6$ → **6** |\n| **Ex 2.** $\\int_1^3 2x\\,dx$ | ① Lower 1, upper 3.
② “Undo” of $2x$ is $x^2$.
③ $3^2 - 1^2 = 8$ → **8** |\n| **Ex 3.** $\\int_0^2 (1+x)\\,dx$ | ① Lower 0, upper 2.
② “Undo”: $x + x^2/2$.
③ $(2+2)-(0+0)=4$ → **4** |\n| **Ex 4.** Given $\\int 2x\\,dx = x^2+C$, value at $x=2$? | ① Substitute into $x^2+C$.
② $x=2$, $C=0$ ⇒ $2^2 = 4$ → **4** |"},"visualCaption":"The definite integral represents the area under the curve. Find an antiderivative and plug in the upper and lower limits.","visualIntroShort":"Integration is the inverse of differentiation. It is used for area under a curve, cumulative quantities, and probability intervals.","visualConceptArea":"Definite integral = area between the curve and the x-axis","visualConceptSweep":"From a to b, the area accumulates.","visualConceptSlices":"Dividing the interval and summing gives the area.","visualConceptGap":"The gap between the rectangles and the curve shrinks as you use more slices; in the limit you get the exact area (the integral)."},"mathVideoRandomVariable":{"chapter":"Chapter 10","title":"Random Variables and Probability Distributions: Capturing Uncertainty in Numbers","description":"A random variable assigns numbers to outcomes of an experiment; a probability distribution summarizes how likely each value is. Used in deep learning for prediction and uncertainty.","sectionTitle":"What are random variables and probability distributions?","whatIs":{"intro":"A **random variable (Random Variable)** maps the outcome of a trial (experiment) to **numbers**. It is usually written $X$. For example, the moment we agree that heads = $1$ and tails = $0$, the real-world act of flipping a coin becomes the mathematical variable $X$. A **probability distribution** is the rule that shows at a glance (like a map) with what probability each of those numbers appears.","discrete":"**① Discrete random variable** — takes only **finite or countable** values. It can be shown in a table, as a function, or as a **bar graph**. The probability $P(X=k)$ for each value $k$ is the **probability mass function (PMF)**; the essential condition is $\\sum_k P(X=k)=1$.","discreteExamples":"**Representative discrete distributions**: The **binomial distribution** deals with the number of heads when flipping a coin multiple times. 
The **Poisson distribution** deals with event counts such as how many customers arrive in a given time period.","continuous":"**② Continuous random variable** — takes **infinitely many** values in an interval. The probability of any **single** value (e.g. exactly 170.00 cm) is $0$, because the area under a curve at a single point is zero. We use a **probability density function (PDF)** for probabilities over **intervals** (e.g. 170–180 cm). It's expressed by a function and a **curve**, not a table.","continuousExamples":"**Representative continuous distribution**: The bell-shaped **normal distribution** is most representative, as many natural data (measurement error, score distributions, etc.) follow it.","distribution":"A **probability distribution** is the **rule** for which values occur and how often. The figure shows **normal (continuous), Poisson (discrete), and binomial (discrete)** — knowing these covers most uses in AI.","pmfIntro":"The **probability mass function (PMF)** is the probability $P(X=k)$ for each value $k$ of a discrete random variable. In a bar chart, the height of each bar is that probability, and the sum of all bar heights is 1. The figure below shows three common distributions.","visualConnect":"**Connecting to the figures** — **Figure 1** (above): the **normal** (left) is continuous (curve); **Poisson** and **binomial** (center, right) are discrete (bars). **Figure 2** compares discrete (bars) and continuous (curve) side by side. In AI: normal for noise and regression, Poisson for event counts, binomial for success counts and binary classification.","distPmfSum":"**Distribution condition (discrete)** — The PMF is the probability $P(X=k)$ of each value $k$. Essential: $\\sum_k P(X=k)=1$. (e.g. For a die, $P(1)+\\cdots+P(6)=1$.)","distPmfSumPlain":"In plain words: For discrete distributions, all the probabilities of the possible outcomes must add up to 1. 
Just like a die—the chances of 1 through 6 add up to 1.","distPdfIntegral":"**Distribution condition (continuous)** — The PDF $f(x)$ gives probability over intervals: $P(a\\le X\\le b)=\\int_a^b f(x)\\,dx$, and the total area is $\\int_{-\\infty}^{\\infty} f(x)\\,dx=1$.","distPdfIntegralPlain":"In plain words: For continuous distributions, probability is the area under the curve. The probability that X falls in [a,b] is the area under the curve from a to b, and the total area under the whole curve is 1.","distExpectation":"**Expectation (mean)** — Discrete: $E[X]=\\sum_k x_k\\, P(X=k)$; continuous is given by an integral. The “average weighted by probability.”","distExpectationPlain":"In plain words: Expectation is the average value when each outcome is weighted by its probability. For a die, it's (1×1/6)+(2×1/6)+…+(6×1/6)=3.5—the \"probability-weighted\" average.","distVariance":"**Variance** — $\\mathrm{Var}(X)=E[(X-E[X])^2]$. Standard deviation is $\\sigma=\\sqrt{\\mathrm{Var}(X)}$. Ch11 covers this in detail.","distVariancePlain":"In plain words: Variance measures how spread out the values are from the mean. You take (each value minus the mean), square it, then average by probability; the square root of variance is the standard deviation.","distFormulaNormal":"**Normal distribution (continuous)** — Density $f(x)=\\frac{1}{\\sigma\\sqrt{2\\pi}}\\,e^{-(x-\\mu)^2/(2\\sigma^2)}$. $\\mu$ is the mean, $\\sigma$ the standard deviation.","distFormulaNormalPlain":"In plain words: A symmetric bell-shaped curve centered at the mean μ. The spread is controlled by σ (standard deviation)—larger σ means a wider, flatter curve. Often used for heights, measurement error, and noise.","distFormulaPoisson":"**Poisson distribution (discrete)** — $P(X=k)=\\frac{\\lambda^k e^{-\\lambda}}{k!}$ ($k=0,1,2,\\ldots$). 
$\\lambda$ is the average number of events in a fixed interval.","distFormulaPoissonPlain":"In plain words: Used when counting how many times an event happens in a fixed time or space. λ is the average count; the formula gives the probability of exactly k events. The bar chart is usually skewed to one side.","distFormulaBinomial":"**Binomial distribution (discrete)** — $P(X=k)=\\binom{n}{k}p^k(1-p)^{n-k}$. $n$ = number of trials, $p$ = success probability per trial.","distFormulaBinomialPlain":"In plain words: You run the same trial n times and count how many successes (k). p is the chance of success on one trial. Like flipping a coin n times and counting heads—often gives a symmetric, peaked bar chart."},"whyImportant":{"prediction":"It is the **basis of prediction and decision**. AI does not just say “this is a cat.” It outputs a **probability distribution** — e.g. “probability of cat 0.98, dog 0.02” — as a **random variable**. From that distribution we see how confident the model is.","inAI":"**Managing uncertainty**: Real data is noisy and uncertain. By modeling measurement error with the **normal** distribution or binary outcomes (e.g. spam or not) with the **binomial**, AI uses probability to reach the most reasonable conclusion."},"howUsed":{"daily":"**Everyday statistics**: rain probability (discrete), average lifespan or height distribution (continuous). Many quantities around us are described by random variables and distributions — **discrete** (bars) vs **continuous** (curves) — so we can read the world clearly.","insideMath":"**Inside deep learning**: Weights are often initialized with the **normal** distribution; the last layer uses **softmax** to turn outputs into a probability distribution (sum 1). 
Probability distributions are involved at every stage; understanding them shows how AI generates and classifies data."},"problemSolving":{"focus":"For a discrete random variable: **① list possible values and their probabilities → ② check that probabilities sum to 1 → ③ expectation = sum of (value)×(probability)**.","probSum":"**Sum of probabilities** — $P(X=1)+P(X=2)+P(X=3)=1$. With denominator 6, $a/6+b/6+c/6=1$ gives $a+b+c=6$. Knowing two of $a,b,c$ gives the third.","expectation":"**Expectation** — $E[X]=x_1 p_1+x_2 p_2+x_3 p_3$. When the denominator is 6, $6\\cdot E[X]$ is an integer, so problems may ask for “6×expectation”.","exampleIntro":"**Examples** — Fill the blank so probabilities sum to 1, or find 6×expectation.","example1":"**Ex 1.** Three probabilities a/6, b/6, c/6 sum to 1, so a+b+c=6. If a=1 and b=2, then c=3.","example2":"**Ex 2.** Values 1, 2, 3 with probabilities 1/6, 2/6, 3/6: 6×expectation = 1×1+2×2+3×3 = 14."},"visualTitleRandomVariable":"Random Variable","visualLabelDiscrete":"Discrete","visualLabelContinuous":"Continuous","visualAxisP":"P(x)","visualAxisF":"f(x)","visualAxisX":"x","visualCaption":"Value k → P(X=k)=PMF, sum 1. Distribution=rule. AI: classification·generation·loss.","visualCaptionLong":"**In AI**, we treat predictions as “possible values + their probabilities”—that’s a random variable and distribution. 
Used in classification (dog 70%, cat 30%), generation (picking the next word), and loss (cross-entropy).","visualLabelPmf":"PMF (discrete)","visualLabelUniform":"Uniform (dice)","visualLabelSum":"Σ = 1","visualIntroShort":"A random variable turns outcomes into numbers; a probability distribution summarizes how likely each value is.","samplingDistTitle1":"Probability distribution of a dice roll","samplingDistTitle2":"Distribution of means from 10 rolls","samplingDistTitle3":"Distribution of means from 30 rolls","samplingDistTitle4":"Distribution of means from 100 rolls","pmfIntroBeforeCharts":"The **probability mass function (PMF)** is the probability $P(X=k)$ for each value $k$ of a discrete random variable. In a bar chart, the height of each bar is that probability, and the sum of all bar heights is 1. Below are three common distributions.","distNormal":"Normal","distPoisson":"Poisson","distPoissonSubtitle":"λ=1.5 (skewed right)","distPoissonShapeDesc":"Skewed to one side → ‘how many times’ an event occurred","distBinomial":"Binomial","distBinomialSubtitle":"n=10, p=0.5 (symmetric)","distBinomialShapeDesc":"Symmetric, peak in the center → ‘number of successes’ in n trials","distCompareHint":"Poisson: skewed (event count) · Binomial: symmetric, peak at center (success count)","figure2Title":"Figure 2: Discrete vs continuous","figure2Discrete":"Discrete (bars)","figure2Continuous":"Continuous (curve)","samplingDistAxisDice":"Dice face","samplingDistAxisProb":"Probability","samplingDistAxisSampleMean":"Sample mean","samplingDistAxisDensity":"Density","samplingDistMean":"Mean","samplingDistVariance":"Variance"},"mathVideoMeanVariance":{"chapter":"Chapter 11","title":"Mean and Variance: The Center and Spread of Distributions","description":"The mean (expected value) is the center of a distribution; variance measures spread. 
Used in AI for prediction, loss, and regularization.","sectionTitle":"What are mean and variance?","whatIs":{"intro":"The **mean (expected value)** is the **center of mass** of a distribution. **Variance** measures how much values **spread** around the mean. **Standard deviation** is the square root of variance, so it shows “typical distance from the mean” in the **same units** as the data.","meanPlain":"**Mean** — e.g. die average (1+…+6)/6=3.5, exam class average, or demand forecast “expected value.” The red line in the figure is the mean $\\mu$.","variancePlain":"**Variance** — probability-weighted average of (value−mean)². Large variance ⇒ more spread. **Standard deviation $\\sigma=\\sqrt{\\text{variance}}$** brings spread back to the original units (points, kg, etc.): e.g. “mean 70, σ=10” means many scores lie roughly in 60–80.","whyBoth":"Knowing only the mean is risky—e.g. a river may have average depth 1 m but spots deeper than 3 m. **Variance** is what we need to manage that risk (volatility). In AI we don’t just output a prediction (mean); we also look at how much it can vary (variance) to measure **confidence**.","conceptsInAIIntro":"**Concepts often used in AI** — The table below summarizes mode, mean, min/max, and median: what they mean and how they are used in AI.","conceptsInAITable":"| Concept | Meaning | In AI |\n| --- | --- | --- |\n| **Mode** | The value with the highest probability; the outcome that appears most often in repeated trials. | Used when choosing the “most likely class” in classification; the argmax of softmax output is the mode. |\n| **Mean (expected value)** | The center of mass of the distribution; the sum of value×probability. It represents the “expected” value. | Used for regression predictions, loss (e.g. MSE), expected reward in reinforcement learning, and so on. |\n| **Min / Max** | The interval [min, max] in which the variable can lie; the smallest and largest values that define the range. 
| Used in loss minimization (gradient descent), value clipping, and setting normalization ranges. |\n| **Median** | The value in the middle when ordered by size. Unlike the mean, it is less affected by extreme values (outliers). | Used when summarizing data with many outliers or when a robust statistic is needed. |"},"whyImportant":{"prediction":"A **measure of prediction accuracy**. The number an AI outputs is usually the **expected value** of its probability distribution. If the variance of that prediction is large, we can interpret it as the model not being confident in its own prediction.","uncertainty":"It **quantifies uncertainty**. In autonomous driving or medical AI, “how certain” matters a lot. Using standard deviation $\\sigma$, we set **confidence intervals** (e.g. mean ± 2σ) and assess the risk of results falling outside that range, supporting safer decisions.","loss":"It is the **design principle of the loss function**. In regression, **MSE (mean squared error)** is the mean of squared differences between target and prediction — i.e. minimizing the **variance** of the error. So reducing variance is exactly how the model gets better.","regularization":"It is the **basis of regularization**. If the variance of weights grows too large, the model becomes oversensitive and **overfits**. Keeping or suppressing variance keeps the model stable and more general."},"howUsed":{"daily":"**Daily life** — Exam scores are reported as “mean 70, standard deviation 10” so you see **center** and **spread**. Same for height/weight distributions, demand forecasts (expected value ± error range), and quality control (spec ± σ).","regression":"**Regression** — The prediction is usually the **conditional expected value**: “average output given this input.” We minimize MSE (mean of squared errors), i.e. 
we minimize a kind of average.","classification":"**Classification** — The model outputs **probabilities** per class; we take the **mode** (the class with the highest probability) as the predicted class. The argmax of the softmax output does exactly that.","rl":"**Reinforcement learning** — Policies are evaluated using the **expected reward**. We learn to maximize “average future reward” for an action, which is an expectation.","insideMath":"**Math flow** — Ch10 defined expectation and variance; Ch11 practices computing them. Ch12 normal distribution is fully determined by mean $\\mu$ and standard deviation $\\sigma$."},"problemSolving":{"focus":"Discrete case: **mean** = sum of $\\text{value}\\times\\text{probability}$, **variance** = $E[X^2]-(E[X])^2$. With denominator 6, $6\\times\\text{mean}$ and $36\\times\\text{variance}$ are integers.","meanStep":"**Mean** — add $\\text{value}\\times\\text{probability}$. With denominator 6, $6\\times\\text{mean}$ is an integer.","varianceStep":"**Variance** — $E[X^2]$ minus $(\\text{mean})^2$. $36\\times\\text{variance}$ is an integer and easy to compute.","exampleIntro":"Below: compute $6\\times\\text{mean}$, $36\\times\\text{variance}$, mean (integer), mode, and cumulative numerator.","example1":"**Example.** Values 1,2,3 with probs $\\frac{1}{6}$, $\\frac{2}{6}$, $\\frac{3}{6}$ → $6\\times\\text{mean} = 1\\times1+2\\times2+3\\times3 = 14$.","example2":"**Example.** Same distribution: $36\\times\\text{variance} = 6\\sum_i (n_i x_i^2) - (\\sum_i n_i x_i)^2$."},"visualTitleMeanVariance":"Mean and variance","visualAxisP":"P(x)","visualAxisX":"x","visualCaption":"Bar heights show the probability of each value. The red line is the mean (μ)—the center of the distribution. The purple band shows the typical spread (μ±σ). 
The tallest bar is the mode—the most frequent value.","visualMeanLabel":"μ","visualSigmaBandLabel":"μ±σ","visualIntroShort":"Mean = center, variance = spread, σ = √variance."},"mathVideoUniformNormal":{"chapter":"Chapter 12","title":"Uniform and Normal Distributions: From Initialization to Prediction","description":"Uniform distribution spreads probability evenly over an interval; normal distribution is bell-shaped around the mean. Used in AI for initialization, noise, and priors.","sectionTitle":"Uniform & Normal distribution","whatIs":{"intro":"Many continuous data in the world follow a certain pattern. Understanding the two most basic—**uniform** and **normal** distributions—is a key step to grasping how AI works inside. The two measures from earlier chapters, **mean** ($\\mu$) and **variance** ($\\sigma^2$), are what shape these distributions.","uniformDef":"**Uniform distribution** — Every value in an interval $[a,b]$ has the **same probability density**. The graph is a flat rectangle. Think of it as extending “each face of a die has equal chance” to a continuous scale. We use it when we want to give every possibility a **fair chance** with no bias.","uniformMeanVar":"The **mean** of a uniform distribution is the midpoint $(a+b)/2$. **Variance** is $(b-a)^2/12$, proportional to the square of the interval length. The wider the interval, the harder it is to predict the outcome (uncertainty grows), so variance grows too.","normalDef":"**Normal distribution** — A **bell-shaped (Bell-curve)** distribution symmetric about the mean. Heights, test scores, measurement error, and many natural phenomena follow it—hence the name “normal.” Also called Gaussian; **mean** ($\\mu$) is the peak, **standard deviation** ($\\sigma$) is the spread.","normalCurve":"The power of the normal distribution is the **empirical rule (68–95–99.7)**: about **68%** of data lie in $\\mu \\pm 1\\sigma$, about **95%** in $\\mu \\pm 2\\sigma$, and about **99.7%** in $\\mu \\pm 3\\sigma$. 
With this rule we can quickly see how far a value is from the mean (outlier or not) and assess **AI prediction confidence**.","whyTwo":"Uniform stands for **“we know nothing—blank slate”**; normal for **“a natural state with a mean as reference.”** AI initializes weights by spreading them uniformly, then uses the normal distribution to model the errors in data as it learns toward the answer."},"whyImportant":{"prior":"**Design of prior information**: In Bayesian statistics, the \"preconception\" that AI has before learning is called the prior distribution. When we want to start from a perfectly fair position we use the uniform distribution; when we have a reasonable guess that (the parameter) is near a certain mean we use the normal distribution to design the model’s basic strength.","noise":"**Mathematical modeling of error**: All data in the world contains noise. These noises occur independently, and when summed they end up following a normal distribution. When AI removes noise from photos or restores blurry audio, assuming the noise has a normal shape makes restoration much more accurate.","centralLimit":"**Central limit theorem**: This is the foundation of statistics. No matter what shape the data has, if we sample it many times and take the mean, the distribution of those means surprisingly approaches a **normal distribution**. Thanks to this, AI can predict the character of the whole population from a small sample by borrowing the normal distribution.","inAI":"In deep learning **weight initialization** can make or break training. Techniques like **Xavier** and **He initialization** finely adjust the variance of uniform or normal distributions so that the data signal is transmitted without distortion to the depths of the network."},"howUsed":{"init":"**Weight initialization** — If we set all weights to zero at the start, the network cannot learn. So we fill them with random numbers from a **uniform** or **normal** distribution. 
Using a normal with small variance keeps most weights near zero, so training starts more stably and quickly.","noise":"**Noise** — VAE samples the latent vector from a normal; diffusion models add and remove Gaussian noise step by step.","regression":"**Regression** — Assuming normal errors makes least squares (OLS) equivalent to **maximum likelihood**. Prediction intervals use $\\mu \\pm k\\sigma$.","bayesian":"**Bayesian** — Uniform or normal priors are common; after observing data we compute the **posterior**. Neural network weights can have normal priors.","insideMath":"**Math flow** — Ch10 random variables and distributions, Ch11 mean and variance, then Ch12 **two concrete distributions** (uniform and normal). Knowing these helps read 'initialization', 'noise', and 'prior' in AI papers."},"problemSolving":{"focus":"**Uniform** — On $[a,b]$, density $1/(b-a)$, mean $(a+b)/2$, variance $(b-a)^2/12$. **Normal** — Mean $\\mu$, variance $\\sigma^2$; interval probabilities from standard normal table or calculator.","uniformExample":"**Example (uniform).** On $[0,6]$, mean is $3$, variance $36/12=3$, standard deviation $\\sqrt{3}$.","normalExample":"**Example (normal).** For mean $70$ and standard deviation $10$, about 68% lie in $60$–$80$, about 95% in $50$–$90$."}},"mathSymbolPaletteTitle":"Math symbols","mathSymbolPaletteDescription":"View math symbols (Greek letters, operators, sets, etc.) with their names and pronunciation. Click to copy.","mathSymbolPaletteSearchPlaceholder":"Search by name or keyword (e.g. 
alpha, sigma, partial)","mathSymbolPaletteNoResults":"No results.","mathSymbolPaletteHint":"Click a symbol to copy to clipboard.","mathSymbolCategoryGreekLower":"Greek (lowercase)","mathSymbolCategoryGreekUpper":"Greek (uppercase)","mathSymbolCategoryOperators":"Operators","mathSymbolCategoryRelations":"Relations","mathSymbolCategoryArrows":"Arrows","mathSymbolCategorySets":"Sets & number systems","mathSymbolCategoryLogic":"Logic","mathSymbolCategoryCalculus":"Calculus","mathSymbolCategoryMisc":"Misc"}},"now":"$undefined","timeZone":"UTC","children":[["$","script",null,{"type":"application/ld+json","dangerouslySetInnerHTML":{"__html":"{\"@context\":\"https://schema.org\",\"@type\":\"WebSite\",\"name\":\"Mdoo AI\",\"alternateName\":[\"MDOOAI\",\"エムドゥAI\",\"Mdoo AI\",\"姆豆AI\"],\"description\":\"Free AI, deep learning & machine learning courses. Learn basic math, neural networks, backpropagation, KNN, regression, ensemble step by step with quizzes. AI for beginners—start here.\",\"url\":\"https://mdooai.com/en\",\"inLanguage\":[\"ko\",\"ja\",\"en\",\"zh-Hans\"],\"potentialAction\":[{\"@type\":\"ReadAction\",\"target\":\"https://mdooai.com/en/learn\"},{\"@type\":\"SearchAction\",\"target\":{\"@type\":\"EntryPoint\",\"urlTemplate\":\"https://mdooai.com/en/community?q={search_term_string}\"},\"query-input\":\"required name=search_term_string\"}]}"}}],["$","script",null,{"type":"application/ld+json","dangerouslySetInnerHTML":{"__html":"{\"@context\":\"https://schema.org\",\"@type\":\"Organization\",\"name\":\"Mdoo AI\",\"url\":\"https://mdooai.com\",\"description\":\"Free AI, deep learning & machine learning courses. Learn basic math, neural networks, backpropagation, KNN, regression, ensemble step by step with quizzes. 
AI for beginners—start here.\",\"logo\":\"https://mdooai.com/app_icon.png\",\"sameAs\":[]}"}}],["$","div",null,{"className":"__className_f367f3 min-w-0 max-w-[100vw] overflow-x-hidden","children":[["$","$L23",null,{}],["$","$13",null,{"fallback":null,"children":["$","$L24",null,{"children":["$","$Lf",null,{"parallelRouterKey":"children","segmentPath":["children","$10","children"],"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L11",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":"$undefined","notFoundStyles":"$undefined"}]}]}]]}]]}]